广州商学院新闻获取

时间：2018-04-16 12:04:50 阅读：195 评论：0 收藏：0 [点我收藏+]

标签：pool getc search 列表 pandas 获取 news amp pen

import re
import xlwt
import time
import pandas
import requests
from multiprocessing import Process,Pool
from bs4 import BeautifulSoup


def getClickCount(newUrl):
    """Fetch the click (hit) count of a news article.

    The article id is the path segment that follows the first ``/`` in the
    part of ``newUrl`` between the first ``_`` and ``.html``. That id is
    sent to the counter endpoint on ``oa.gzcc.cn``; the response is expected
    to contain a fragment of the form ``hits').html('<count>');``.

    :param newUrl: URL of the news article page.
    :return: int, the click count found in the counter response.
    :raises ValueError: if no id can be extracted from ``newUrl`` or the
        counter response does not contain a numeric hit count.
    """
    # Raw string, and the dot before "html" is escaped so it only matches a
    # literal ".html" suffix.
    id_match = re.search(r'_(.*)\.html', newUrl)
    segments = id_match.group(1).split('/') if id_match else []
    if len(segments) < 2:
        raise ValueError('cannot extract news id from URL: {!r}'.format(newUrl))
    new_id = segments[1]

    url = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(new_id)
    # A timeout keeps a stalled connection from hanging the whole crawl.
    response = requests.get(url, timeout=10)

    # Capture digits only, so the value handed to int() is always numeric.
    hits_match = re.search(r"hits'\)\.html\('(\d+)'\);", response.text)
    if hits_match is None:
        raise ValueError('no hit count in counter response for id {}'.format(new_id))
    return int(hits_match.group(1))

def getNewDetail(newsUrl):
    """Fetch one news article page and return its details as a dict.

    The body text is taken from ``div.show-content`` and the metadata from
    ``div.show-info``. The metadata text is cut into fields at non-breaking
    spaces and newlines; each field of the form ``label：value`` (full-width
    colon) becomes one dict entry. The click count is not read from the
    page: a field containing '次' is replaced by the value returned by
    ``getClickCount``.

    :param newsUrl: URL of the news article page.
    :return: dict with one entry per metadata field, plus '链接' (the URL)
        and '正文' (the body text); '发布时间' is converted to a
        ``time.struct_time``.
    :raises ValueError: if the page lacks the content or info container, or
        the publish time does not match ``%Y-%m-%d %H:%M:%S``.
    :raises KeyError: if the metadata has no '发布时间' field.
    """
    web = requests.get(newsUrl, timeout=10)
    web.encoding = 'utf-8'
    soup = BeautifulSoup(web.text, 'html.parser')

    body = soup.find('div', {'class': 'show-content'})  # article body
    info = soup.find('div', {'class': 'show-info'})     # metadata line
    if body is None or info is None:
        raise ValueError('unexpected page layout: {!r}'.format(newsUrl))

    content = ''.join(body.stripped_strings)

    detail = {}
    # Split on real newlines: splitting on the letter "n" would cut up any
    # field containing a Latin "n".
    for field in info.text.replace('\xa0', '\n').split('\n'):
        if len(field) <= 3:
            continue  # too short to hold "label：value"
        if '发布时间' in field:
            # The publish time uses an ASCII colon after its label; only that
            # first colon is normalised, so the colons in HH:MM:SS stay.
            field = field.replace(':', '：', 1)
        if '次' in field:
            field = '点击：{}次'.format(getClickCount(newsUrl))

        # partition() keeps extra colons inside the value and lets fields
        # without a colon be skipped, where dict() on raw split() results
        # would raise.
        label, colon, value = field.partition('：')
        if not colon:
            continue
        detail[label.strip()] = value.strip()

    detail['链接'] = newsUrl
    detail['正文'] = content
    detail['发布时间'] = time.strptime(detail['发布时间'], '%Y-%m-%d %H:%M:%S')
    return detail
def getNewsUrl(url):
    """Collect the article links found on one news list page.

    Looks up ``ul.news-list`` in the page and, for every element that is a
    direct child of it, takes the ``href`` of the first ``<a>`` inside.

    :param url: URL of a news list page.
    :return: list of ``href`` strings, in page order.
    :raises ValueError: if the page has no ``ul.news-list`` element.
    """
    web = requests.get(url, timeout=10)
    web.encoding = 'utf-8'

    soup = BeautifulSoup(web.text, 'html.parser')
    listing = soup.find('ul', {'class': 'news-list'})
    if listing is None:
        raise ValueError('no news list found on page: {!r}'.format(url))

    newsList = []
    # find_all(True, recursive=False) yields only the direct child *tags*,
    # so whitespace text nodes between items are never dereferenced and a
    # compact item holding just one child is not skipped.
    for item in listing.find_all(True, recursive=False):
        link = item.a
        if link is not None and link.has_attr('href'):
            newsList.append(link['href'])
    return newsList

def getPage(url, perPage=10):
    """Compute how many list pages the news section has.

    Reads the first ``a.a1`` element of the page, takes the first number in
    its text as the total article count, and divides by ``perPage``,
    rounding up.

    :param url: URL of the first news list page.
    :param perPage: number of articles shown per list page (default 10).
    :return: int, the number of list pages (at least 1).
    :raises ValueError: if the page has no ``a.a1`` element or its text
        contains no number.
    """
    web = requests.get(url, timeout=10)
    web.encoding = 'utf-8'

    soup = BeautifulSoup(web.text, 'html.parser')
    counter = soup.find('a', {'class': 'a1'})
    digits = re.search(r'\d+', counter.get_text()) if counter is not None else None
    if digits is None:
        raise ValueError('no article count found on page: {!r}'.format(url))
    total = int(digits.group())

    # Ceiling division: "total // perPage + 1" reports one page too many
    # whenever total is an exact multiple of perPage.
    return max(1, -(-total // perPage))

def getnews(url):
    """Scrape every article linked from one list page into ``news``.

    Prints a progress marker, fetches the article links of the list page
    with ``getNewsUrl``, and appends the ``getNewDetail`` result for each
    link to the module-level ``news`` list, which must exist before this
    function is called.

    :param url: URL of a news list page.
    :return: None
    """
    print('start in %s' % url[39:])
    # A separate loop name keeps the ``url`` parameter from being rebound.
    for newsUrl in getNewsUrl(url):
        news.append(getNewDetail(newsUrl))
    print(' end ', end='')

if __name__ == '__main__':

    # Module-level accumulator that getnews() appends article dicts to.
    news = []

    baseUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
    url = baseUrl
    # The page count is the only thing needed from the first page here; its
    # links are fetched by getnews() in the loop below.
    page = getPage(url)
    for i in range(1, page + 1):
        # Page 1 is the section index itself; later pages are "<n>.html".
        if i == 1:
            url = baseUrl
        else:
            url = '{}{}.html'.format(baseUrl, i)
        getnews(url)

    df = pandas.DataFrame(news)
    # NOTE(review): writing the legacy ".xls" format depends on the
    # installed pandas still having an xls writer engine - confirm.
    df.to_excel('gzccnews.xls')

广州商学院新闻获取

标签：pool getc search 列表 pandas 获取 news amp pen

原文地址：https://www.cnblogs.com/127li/p/8855041.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行