码迷,mamicode.com
首页 > 其他好文 > 详细

爬虫大作业

时间:2018-04-24 00:18:31      阅读:208      评论:0      收藏:0      [点我收藏+]

标签:code   encoding   文档   from   off   html   gen   cloud   pre   

 

import jieba.analyse
from PIL import Image,ImageSequence
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud,ImageColorGenerator
import requests
from urllib import parse
from bs4 import BeautifulSoup

def getWord(text_path="youku.txt", mask_path="789.jpg",
            font_path="./fonts/simhei.ttf", output_path="dream.png"):
    """Build, display and save a word cloud from the keywords of a text file.

    The text is read as UTF-8, its top 50 keywords are extracted with
    jieba's TextRank, and a word cloud shaped and coloured by the mask
    image is shown with matplotlib and then written to ``output_path``.

    Args:
        text_path: UTF-8 text file to analyse.
        mask_path: Image used both as the cloud mask and as colour source.
        font_path: Font file passed to WordCloud as ``font_path``.
        output_path: File the rendered cloud is saved to.
    """
    # Read the whole file in one call. The previous "for line in f:
    # f.read()" pattern consumed the first line in the loop header and
    # silently dropped it; the handle was also never closed.
    with open(text_path, "r", encoding="utf-8") as f:
        lyric = f.read()

    # TextRank with withWeight=True yields (word, weight) pairs, which map
    # directly onto the frequency dict consumed below.
    result = jieba.analyse.textrank(lyric, topK=50, withWeight=True)
    keywords = dict(result)
    print(keywords)

    # Load the template image that the word cloud is generated against.
    image = Image.open(mask_path)
    graph = np.array(image)

    # Configure the word cloud.
    wc = WordCloud(font_path=font_path, background_color="White",
                   max_words=50, mask=graph)
    wc.generate_from_frequencies(keywords)

    # Recolour the words from the mask image's own colours. Only the
    # recoloured cloud is drawn; a plain imshow(wc) issued first would be
    # painted over immediately.
    image_color = ImageColorGenerator(graph)
    plt.imshow(wc.recolor(color_func=image_color))
    plt.axis("off")
    plt.show()
    wc.to_file(output_path)

# Crawl the titles from a Youku category listing page, append them to
# youku.txt, then build the word cloud from that file.
name = "youku"
unique = parse.quote(name)
print(unique)
url = "http://list.youku.com/category/show/c_96_g_%E7%A7%91%E5%B9%BB_s_1_d_1.html?spm=a2hmv.20009921.m_86982.5~5~5!3~1~3!5~A"
print(url)

# A timeout keeps the script from hanging forever on an unresponsive server.
res = requests.get(url, timeout=10)
res.encoding = "utf-8"
soup = BeautifulSoup(res.text, "html.parser")
titles = soup.select(".info-list .title a")

# Open the output file once and append one title per line, instead of
# reopening and closing it for every single title.
with open("youku.txt", "a", encoding="utf-8") as f:
    for link in titles:
        title = link.text
        f.write(title)
        f.write("\n")

getWord()

技术分享图片技术分享图片

爬虫大作业

标签:code   encoding   文档   from   off   html   gen   cloud   pre   

原文地址:https://www.cnblogs.com/darkhate/p/8922674.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!