# Tags: sel python get png int 发布 content sele mat
import requests
import re
from bs4 import BeautifulSoup
from datetime import datetime
# URL of the campus-news list page that the script scrapes.
newsurl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
# Fetch the list page; requests.get returns a Response object.
res = requests.get(newsurl)
# Set the encoding explicitly before res.text is read so the body is decoded as UTF-8.
res.encoding = 'utf-8'
# Parsed list page, kept at module level.
soup = BeautifulSoup(res.text, 'html.parser')
def _extract_info_field(info, label):
    """Return the value glued to *label* in the article info line, or ''.

    The value is whatever follows *label* inside the same
    whitespace-delimited token, e.g. ``'作者:abc  来源:xyz'`` with label
    ``'作者:'`` gives ``'abc'``.
    """
    start = info.find(label)
    if start == -1:
        # Label absent: slicing from -1 would return unrelated text.
        return ''
    token = info[start:].split()[0]
    # Drop the label as a prefix; str.lstrip(label) would strip a character
    # set and could eat the first characters of the value itself.
    return token[len(label):]


def getNewDetail(newsUrl):
    """Print the details of the first news item on the list page *newsUrl*.

    Downloads the list page, takes the first ``<li>`` that contains a
    ``.news-list-title`` element, downloads the linked article and prints
    its title, link, publish time, author, source, photographer,
    description, click count (via ``getClickCount``) and body text.

    Args:
        newsUrl: URL of the news list page.

    Returns:
        None. All results are written to stdout.

    Raises:
        IndexError: if an expected element (link, description, ``#content``
            or ``.show-info``) is missing from the page.
    """
    listRes = requests.get(newsUrl)
    listRes.encoding = 'utf-8'
    listSoup = BeautifulSoup(listRes.text, 'html.parser')

    for news in listSoup.select('li'):
        titles = news.select('.news-list-title')
        if not titles:
            # Not a news entry (e.g. a navigation <li>); skip it.
            continue

        t = titles[0].text  # title
        a = news.select('a')[0].attrs['href']  # link to the article page
        description = news.select('.news-list-description')[0].text

        # Fetch and parse the article page once; both the body and the
        # info line are read from the same document.
        detailRes = requests.get(a)
        detailRes.encoding = 'utf-8'
        soupd = BeautifulSoup(detailRes.text, 'html.parser')
        content = soupd.select('#content')[0].text
        info = soupd.select('.show-info')[0].text

        # Locate the timestamp by pattern instead of assuming it sits at a
        # fixed offset right after the '发布时间:' label.
        stamp = re.search(r'\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2}', info)
        dt = None
        if stamp:
            dt = datetime.strptime(stamp.group(0), '%Y-%m-%d %H:%M:%S')

        author = _extract_info_field(info, '作者:')
        source = _extract_info_field(info, '来源:')
        photo = _extract_info_field(info, '摄影:')

        print("新闻标题:", t)
        print("链接:", a)
        print("发布时间:", dt)
        print("作者:", author)
        print("来源:", source)
        print("摄影:", photo)
        print("描述:", description)
        getClickCount(a)
        print("正文:", content)
        # Only the first news entry is reported.
        break
def getClickCount(newsUrl):
    """Print the click count and the numeric id of the article at *newsUrl*.

    The id is the digits of the last path segment of the article URL
    (``.../xiaoyuanxinwen_0404/9183.html`` -> ``9183``). It is used as the
    ``id`` query parameter of the click-count endpoint, so the count
    belongs to the requested article.

    Args:
        newsUrl: URL of a single news article, ending in ``/<digits>.html``.

    Returns:
        None. The count and the id are written to stdout.

    Raises:
        ValueError: if no ``/<digits>.html`` segment is found in *newsUrl*.
    """
    idMatch = re.search(r'/(\d+)\.html', newsUrl)
    if idMatch is None:
        raise ValueError('cannot find a news id in URL: {!r}'.format(newsUrl))
    newsId = idMatch.group(1)

    clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsId)
    # Take the text after the last '.html' in the response and peel off the
    # surrounding "('" and "');" -- presumably the response ends with a
    # call of the form .html('N'); (confirm against the live endpoint).
    count = requests.get(clickUrl).text.split('.html')[-1].lstrip("('").rstrip("');")
    print("点击次数:", count)
    print('新闻编号:', newsId)
# Run the scraper against the news list URL stored in `newsurl`.
getNewDetail(newsurl)

# Tags: sel python get png int 发布 content sele mat
# Original post: https://www.cnblogs.com/1996-yxl/p/8747213.html