A week-2 exercise from a four-week web-scraping course: crawl the details of every item in the second-hand goods section of Ganji.
The overall approach:
1. Crawl the channel-page URLs.
2. Crawl the product detail-page URLs from each channel and write them to MongoDB (the url_list collection).
3. Read the URLs back from url_list, crawl each product's details, and write them to MongoDB (the p_info collection).
The code is split across three .py files:
1. channel_url.py: fetches the channel-page URLs;
2. page_parse.py: the two scraper functions, one for each of the two collections;
3. main.py: the main program (i.e., this file), which starts a pool of worker processes and drives the whole job.
The job finished without a hitch. Honestly, Ganji seems to be on its way out; there isn't much data left.
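For reference, the documents written to the two collections end up shaped roughly like this (field names come from the code below; the values here are placeholders for illustration):

# A minimal sketch of the two document shapes (values are placeholders):
url_list_doc = {'p_url': 'http://bj.ganji.com/jiaju/xxx.htm'}   # written by get_product_url()
p_info_doc = {                                                  # written by get_product_info()
    'title': '...',        # listing title
    'category': ['...'],   # breadcrumb trail, as a list of strings
    'date': '...',         # publication date text
    'price': '...',        # price text
    'address': '...',      # district / address link text
    'url': '...',          # the detail-page URL itself
}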
channel_url.py:
import requests
from bs4 import BeautifulSoup

start_url = 'http://bj.ganji.com/wu/'
url_host = 'http://bj.ganji.com'

def get_channel_url(url):
    channel_urls = []
    raw_data = requests.get(url).text
    soup = BeautifulSoup(raw_data, 'lxml')
    # each channel link sits in a <dt> inside div.content
    eles = soup.select('div.content dt>a')
    for e in eles:
        channel_url = url_host + e.get('href')
        print(channel_url)
        channel_urls.append(channel_url)
    return channel_urls

# channel_urls = get_channel_url(start_url)
# print('len(channel_urls):', len(channel_urls))
# The output of get_channel_url() is saved below, so it doesn't need to be run again.
channel_urls = '''
http://bj.ganji.com/jiaju/
http://bj.ganji.com/rirongbaihuo/
http://bj.ganji.com/shouji/
http://bj.ganji.com/bangong/
http://bj.ganji.com/nongyongpin/
http://bj.ganji.com/jiadian/
http://bj.ganji.com/ershoubijibendiannao/
http://bj.ganji.com/ruanjiantushu/
http://bj.ganji.com/yingyouyunfu/
http://bj.ganji.com/diannao/
http://bj.ganji.com/xianzhilipin/
http://bj.ganji.com/fushixiaobaxuemao/
http://bj.ganji.com/meironghuazhuang/
http://bj.ganji.com/shuma/
http://bj.ganji.com/laonianyongpin/
http://bj.ganji.com/xuniwupin/
http://bj.ganji.com/qitawupin/
http://bj.ganji.com/ershoufree/
http://bj.ganji.com/wupinjiaohuan/
'''
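Since channel_urls is stored as a single triple-quoted string, main.py later calls channel_urls.split() to turn it back into a list. A quick sanity check of that round trip:

from channel_url import channel_urls

# split() with no argument splits on any whitespace and drops empty strings,
# so the blank first and last lines of the triple-quoted block disappear.
urls = channel_urls.split()
print(len(urls))  # 19
print(urls[0])    # http://bj.ganji.com/jiaju/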
page_parse.py:
import requests
from bs4 import BeautifulSoup
from time import sleep
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
ganji = client['ganji']
url_list = ganji['url_list']
p_info = ganji['p_info']

# Given a channel URL, crawl the URLs of every product in that channel,
# print them, and write them to MongoDB.
def get_product_url(url):
    channel_url = url
    page_num = 1
    while True:
        raw_page = requests.get(url).text
        print('Fetching page:', url)
        sleep(2)
        soup = BeautifulSoup(raw_page, 'lxml')
        eles = soup.select('a.ft-tit')
        print('len(eles):', len(eles))
        for e in eles:
            p_url = e.get('href')
            url_list.insert_one({'p_url': p_url})
            print(p_url)
        # Ganji paginates channels as <channel>o2/, <channel>o3/, ...;
        # keep going as long as a "next page" link exists.
        if soup.select('a.next'):
            page_num += 1
            url = channel_url + 'o' + str(page_num) + '/'
        else:
            break
# Given a product detail-page URL, crawl the product's details,
# print them, and write them to MongoDB.
def get_product_info(url):
    raw_page = requests.get(url).text
    sleep(2)
    soup = BeautifulSoup(raw_page, 'lxml')
    # Taken-down listings show a "信息刚被删除~" notice instead of product details.
    if soup.select("p:contains('信息刚被删除~')"):
        print('Listing has been deleted.')
    else:
        title = soup.select('h1.title-name')[0].get_text() if soup.select('h1.title-name') else None
        category = list(soup.select('div.crumbs.routes.clearfix')[0].stripped_strings) if soup.select('div.crumbs.routes.clearfix') else None
        # keep only the part of the text before the '\' separator
        date = soup.select('i.pr-5')[0].get_text().split('\\')[0].strip() if soup.select('i.pr-5') else None
        price = soup.select('i.f22.fc-orange.f-type')[0].get_text() if soup.select('i.f22.fc-orange.f-type') else None
        address = soup.select('ul.det-infor>li:nth-child(2)>a')[0].get_text() if soup.select('ul.det-infor>li:nth-child(2)>a') else None
        p_dict = {'title': title, 'category': category, 'date': date, 'price': price, 'address': address, 'url': url}
        p_info.insert_one(p_dict)
        print(p_dict)
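One caveat with get_product_url(): url_list.insert_one() writes a new document every time, so re-running a channel crawl duplicates URLs. A minimal sketch of one way to make the writes idempotent, assuming a unique index on p_url and a hypothetical save_product_url() helper in place of the insert_one() call:

from pymongo import MongoClient

client = MongoClient('localhost', 27017)
url_list = client['ganji']['url_list']

# One-time setup: enforce p_url uniqueness at the database level.
url_list.create_index('p_url', unique=True)

def save_product_url(p_url):  # hypothetical helper, not in the original code
    # upsert=True matches an existing document (or creates one) instead of duplicating it
    url_list.update_one({'p_url': p_url}, {'$set': {'p_url': p_url}}, upsert=True)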
main.py:
from channel_url import channel_urls  # importing one name from channel_url.py runs the whole module once; its other names stay inside that module
from page_parse import get_product_url, get_product_info, url_list  # url_list is needed by read_all_p_urls()
from multiprocessing import Pool
from datetime import datetime

# Read the product URLs back out of MongoDB and return them all.
def read_all_p_urls():
    all_p_urls = []
    for item in url_list.find():
        all_p_urls.append(item['p_url'])
    return all_p_urls

if __name__ == '__main__':
    start_time = datetime.now()
    # Without multiprocessing this takes several times longer:
    # for channel in channel_urls.split():
    #     get_product_url(channel)
    pool = Pool()
    # With multiprocessing, an explicit 4-process pool and the default pool size take about the same time:
    # pool = Pool(processes=4)
    # Step 1: from the channel URLs, collect product URLs into MongoDB.
    pool.map(get_product_url, channel_urls.split())
    # Step 2: from the product URLs, collect product details into MongoDB.
    # This step can also be run separately from step 1.
    pool.map(get_product_info, read_all_p_urls())
    end_time = datetime.now()
    during = end_time - start_time
    print('Total time:', during)
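If the second pool.map() is interrupted, re-running main.py scrapes every product URL again from scratch. A small resume sketch, under the assumption that a URL counts as done once it appears in p_info (p_info would need to be imported from page_parse alongside url_list; read_unscraped_p_urls is a hypothetical helper):

from page_parse import p_info

def read_unscraped_p_urls():  # hypothetical helper, not in the original code
    # project only the url field; URLs already written to p_info are skipped
    scraped = {doc['url'] for doc in p_info.find({}, {'url': 1})}
    return [u for u in read_all_p_urls() if u not in scraped]

# then: pool.map(get_product_info, read_unscraped_p_urls())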