# Tags: pool getc search 列表 pandas 获取 news amp pen
import re
import xlwt
import time
import pandas
import requests
from multiprocessing import Process,Pool
from bs4 import BeautifulSoup
def getClickCount(newUrl):
    """
    Fetch the click (view) count of a news article.

    The article id is the path segment that follows the directory
    containing an underscore in ``newUrl``.  It is sent to the counter API,
    and the count is read from the ``hits').html('<n>');`` snippet of the
    response.

    :param newUrl: URL of the news page, shaped like
        ``.../<name>_<suffix>/<id>.html``
    :return: int, the click count reported by the counter API
    :raises ValueError: if no article id can be extracted from ``newUrl``
        or the API response does not contain a count
    """
    # Validate the URL before doing any network I/O.
    id_match = re.search(r'_[^/]*/([^/]+?)\.html', newUrl)
    if id_match is None:
        raise ValueError('cannot extract news id from URL: {}'.format(newUrl))
    new_id = id_match.group(1)

    url = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(new_id)
    # A timeout keeps one stalled request from hanging the whole crawl.
    content = requests.get(url, timeout=10)

    count_match = re.search(r"hits'\)\.html\('(\d+)'\);", content.text)
    if count_match is None:
        raise ValueError('no click count in response for news id {}'.format(new_id))
    return int(count_match.group(1))
def getNewDetail(newsUrl):
    """
    Fetch the details of one Guangzhou College of Commerce news article.

    The info line of the page (``div.show-info``) is split into
    ``label:value`` fields, which become the keys and values of the result.
    The field containing ``次`` is replaced by the live click count obtained
    from ``getClickCount``.  Three keys are always set by this function:
    ``链接`` (the article URL), ``正文`` (the body text) and ``发布时间``
    (parsed into a ``time.struct_time``).

    :param newsUrl: URL of the news article page
    :return: Dict
    :raises ValueError: if the page lacks the expected content/info blocks
        or the publish time is not in ``%Y-%m-%d %H:%M:%S`` format
    :raises KeyError: if the info line has no ``发布时间`` field
    """
    web = requests.get(newsUrl, timeout=10)
    web.encoding = 'utf-8'
    soup = BeautifulSoup(web.text, 'html.parser')

    structure = soup.find('div', {'class': 'show-content'})  # article body
    info = soup.find('div', {'class': 'show-info'})  # metadata line
    if structure is None or info is None:
        raise ValueError('unexpected page layout: {}'.format(newsUrl))

    content = ''.join(structure.stripped_strings)

    detail = {}
    # Fields are separated by non-breaking spaces and/or newlines.
    for field in re.split('[\xa0\n]', info.text):
        if len(field) <= 3:
            continue
        field = field.strip()
        if '次' in field:
            # The static page carries no number here; ask the counter API.
            detail['点击'] = '{}次'.format(getClickCount(newsUrl))
            continue
        # Split on the FIRST colon only (ASCII or full-width) so that the
        # colons inside a timestamp such as "11:57:00" stay in the value.
        parts = re.split('[::]', field, maxsplit=1)
        if len(parts) != 2:
            continue  # not a "label:value" field
        detail[parts[0].strip()] = parts[1].strip()

    detail['链接'] = newsUrl
    detail['正文'] = content
    detail['发布时间'] = time.strptime(detail['发布时间'], '%Y-%m-%d %H:%M:%S')
    return detail
def getNewsUrl(url):
    """
    Collect the article links found on one news list page.

    Looks up ``ul.news-list`` and takes the first link of each of its
    direct child elements.

    :param url: URL of a news list page
    :return: List of ``href`` values, in page order; empty if the page has
        no ``ul.news-list`` element
    """
    web = requests.get(url, timeout=10)
    web.encoding = 'utf-8'
    soup = BeautifulSoup(web.text, 'html.parser')

    news_list = soup.find('ul', {'class': 'news-list'})
    if news_list is None:
        return []

    newsList = []
    # Iterate over child *tags* only, so whitespace text nodes between the
    # items can never be mistaken for an entry.
    for child in news_list.find_all(True, recursive=False):
        anchor = child.find('a', href=True)
        if anchor is not None:
            newsList.append(anchor['href'])
    return newsList
def getPage(url, page_size=10):
    """
    Work out how many list pages the news section has.

    The total number of articles is read from the first ``a.a1`` element of
    the page and divided by ``page_size``, rounding up.

    :param url: URL of the first news list page
    :param page_size: number of articles shown per list page
    :return: int, the number of list pages (at least 1)
    :raises ValueError: if ``page_size`` is not positive or the article
        total cannot be found on the page
    """
    if page_size <= 0:
        raise ValueError('page_size must be a positive integer')

    web = requests.get(url, timeout=10)
    web.encoding = 'utf-8'
    soup = BeautifulSoup(web.text, 'html.parser')

    counter = soup.find('a', {'class': 'a1'})
    total_match = None if counter is None else re.search(r'\d+', counter.get_text())
    if total_match is None:
        raise ValueError('cannot find the article total on {}'.format(url))
    total = int(total_match.group())

    # Ceiling division: an exact multiple of page_size must not yield an
    # extra, non-existent page.  The first page always exists.
    return max(1, -(-total // page_size))
def getnews(url, collector=None):
    """
    Scrape every article linked from one list page.

    Each article's detail dict is appended to ``collector``; when no
    collector is given, the module-level ``news`` list is used.  Progress
    markers are printed before and after the page is processed.

    :param url: URL of a news list page
    :param collector: optional list receiving the article dicts
    :return: the list the article dicts were appended to
    """
    target = news if collector is None else collector
    # 39 == len('http://news.gzcc.cn/html/xiaoyuanxinwen'): print only the
    # page-specific tail of the URL.
    print('start in %s' % url[39:])
    for news_url in getNewsUrl(url):
        target.append(getNewDetail(news_url))
    print(' end ', end='')
    return target
if __name__ == '__main__':
    # Crawl every list page of the campus-news section and dump all
    # articles into one Excel sheet.
    news = []  # filled by getnews()
    base_url = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
    page = getPage(base_url)
    for i in range(1, page + 1):
        # The first page is the directory index; later pages are "<n>.html".
        url = base_url if i == 1 else '{}{}.html'.format(base_url, i)
        getnews(url)
    df = pandas.DataFrame(news)
    # NOTE(review): writing the legacy .xls format needs an Excel writer
    # engine that supports it (the file imports xlwt) - confirm it is
    # available in the target environment.
    df.to_excel('gzccnews.xls')
# Tags: pool getc search 列表 pandas 获取 news amp pen
# Original article: https://www.cnblogs.com/127li/p/8855041.html