标签:form index lis lib htm 完整 rom item .text
1. 选取的是4399小游戏的网址:http://www.4399.com/flash/gamehw.htm

2.网络上爬取的相关数据
import requests
from bs4 import BeautifulSoup


def get(gameurl):
    """Print the title of every game listed on a 4399 game index page.

    Downloads ``gameurl``, decodes the body as GB2312, takes the first
    element with CSS class ``tm_list`` and prints the text of the first
    ``<a>`` found inside each of its direct child tags.

    Args:
        gameurl: URL of the page to scrape.

    Returns:
        None. Titles are written to standard output, one per line.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
        IndexError: if the page contains no ``.tm_list`` element.
    """
    # A timeout keeps the script from hanging forever on a stalled server.
    res = requests.get(gameurl, timeout=10)
    res.raise_for_status()
    res.encoding = "gb2312"
    soup = BeautifulSoup(res.text, "html.parser")

    containers = soup.select(".tm_list")
    if not containers:
        raise IndexError(
            "no element with class 'tm_list' found at {}".format(gameurl)
        )

    # Only direct child *tags* are visited: the whitespace/text nodes between
    # them have no anchors and previously had to be skipped via a bare except.
    for games in containers[0].find_all(True, recursive=False):
        link = games.select_one("a")
        if link is not None:
            print(link.text)


if __name__ == "__main__":
    gameurl = "http://www.4399.com/flash/gamehw.htm"
    # get() prints the titles itself and returns None, so its result is not
    # printed again.
    get(gameurl)
3.进行文本分析
from collections import Counter

import jieba
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
from wordcloud import WordCloud


def get(gameurl, txt):
    """Scrape game titles, print the most frequent words and show a word cloud.

    Downloads ``gameurl``, decodes the body as GB2312, and collects the text
    of the first ``<a>`` inside each direct child tag of the first
    ``.tm_list`` element. The titles are appended to ``txt``, the result is
    segmented with ``jieba.lcut``, and up to 25 of the most frequent words
    longer than one character are printed with their counts. Finally a word
    cloud built from all segmented tokens is displayed with matplotlib.

    Args:
        gameurl: URL of the page to scrape.
        txt: Text to which the scraped titles are appended before analysis.

    Returns:
        None. Results are printed and shown in a matplotlib window.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
        IndexError: if the page contains no ``.tm_list`` element.
    """
    # A timeout keeps the script from hanging forever on a stalled server.
    res = requests.get(gameurl, timeout=10)
    res.raise_for_status()
    res.encoding = "gb2312"
    soup = BeautifulSoup(res.text, "html.parser")

    containers = soup.select(".tm_list")
    if not containers:
        raise IndexError(
            "no element with class 'tm_list' found at {}".format(gameurl)
        )

    # Only direct child *tags* are visited: the whitespace/text nodes between
    # them have no anchors and previously had to be skipped via a bare except.
    titles = []
    for games in containers[0].find_all(True, recursive=False):
        link = games.select_one("a")
        if link is not None:
            titles.append(link.text)
    txt += "".join(titles)

    words = jieba.lcut(txt)
    if not words:
        # Nothing was scraped and no seed text was given: there is nothing
        # to count or to draw.
        return

    # Single-character tokens are left out of the frequency table.
    counts = Counter(word for word in words if len(word) > 1)
    # most_common() returns at most 25 entries, so a page yielding fewer
    # distinct words no longer raises IndexError.
    for word, count in counts.most_common(25):
        print("{:<5}{:>5}".format(word, count))

    # NOTE(review): WordCloud's default font may lack CJK glyphs; if the
    # titles render as empty boxes, pass font_path=<a CJK-capable font> -
    # confirm on the target machine.
    wc = WordCloud().generate(" ".join(words))
    plt.imshow(wc)
    plt.axis("off")
    plt.show()


if __name__ == "__main__":
    gameurl = "http://www.4399.com/flash/gamehw.htm"
    txt = ""
    # get() reports its results itself and returns None, so its result is
    # not printed again.
    get(gameurl, txt)

# The generated word cloud is shown below.

标签:form index lis lib htm 完整 rom item .text
原文地址:http://www.cnblogs.com/0042ljc/p/7766860.html