
Scrapy framework project: crawl all Zhihu user information and save it to MongoDB



spider

import scrapy
import json, time, re
from zhihuinfo.items import ZhihuinfoItem


class ZhihuSpider(scrapy.Spider):
    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['https://www.zhihu.com/api/v4/members/eve-lee-55/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=20&limit=20']

    def parse(self, response):
        temp_data = json.loads(response.body.decode("utf-8"))["data"]
        count = len(temp_data)

        # 18 or fewer users returned means this is the last page of followees.
        if count <= 18:
            pass
        # Otherwise increase the offset so the spider requests the next page.
        else:
            offset = re.findall(re.compile(r'&offset=(.*?)&'), response.url)[0]
            new_offset = int(offset) + 20
            print(new_offset)
            time.sleep(1)
            yield scrapy.Request(
                "https://www.zhihu.com/api/v4/members/eve-lee-55/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=" + str(new_offset) + "&limit=20",
                callback=self.parse,
                dont_filter=True)

        for i in temp_data:
            item = ZhihuinfoItem()
            item["name"] = i["name"]
            item["url_token"] = i["url_token"]
            item["headline"] = i["headline"]
            item["follower_count"] = i["follower_count"]
            item["answer_count"] = i["answer_count"]
            item["articles_count"] = i["articles_count"]
            item["id"] = i["id"]
            item["type"] = i["type"]

            # Keep a plain-text record of every url_token already crawled, so the
            # same user is not scraped twice.  Opening with "a+" also creates the
            # file on the first run.
            with open("userinfo.txt", "a+") as f:
                f.seek(0)
                user_list = f.read()

            if i["url_token"] not in user_list:
                with open("userinfo.txt", "a") as f:  # "a" means append
                    f.write(i["url_token"] + "-----")
                yield item

                # Switch to this user's own followees list; the crawl keeps
                # spreading outwards and can, in theory, reach every active,
                # well-connected user.
                new_url = "https://www.zhihu.com/api/v4/members/" + i["url_token"] + "/followees?include=data%5B*%5D.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=20&limit=20"
                time.sleep(1)
                yield scrapy.Request(url=new_url, callback=self.parse)
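
For reference, each element of the "data" array in the API response is a JSON object, and the spider reads exactly the keys shown below. The key names are the ones accessed in the code above; the values are invented placeholders:

{
    "id": "0123456789abcdef",
    "url_token": "example-user",
    "name": "Example User",
    "headline": "an example headline",
    "follower_count": 123,
    "answer_count": 45,
    "articles_count": 6,
    "type": "people"
}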




pipelines

import pymongo
from scrapy.conf import settings


class ZhihuinfoPipeline(object):
    def __init__(self):
        # Read the MongoDB connection details from the project settings.
        host = settings["MONGODB_HOST"]
        port = settings["MONGODB_PORT"]
        dbname = settings["MONGODB_DBNAME"]
        client = pymongo.MongoClient(host=host, port=port)
        tdb = client[dbname]
        self.post = tdb[settings["MONGODB_DOCNAME"]]

    def process_item(self, item, spider):
        # Convert the item to a plain dict and write it as one MongoDB document.
        zhihu_user = dict(item)
        self.post.insert_one(zhihu_user)  # insert_one() replaces the deprecated insert()
        return item
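
The pipeline reads its connection details from the project settings, which the post does not show. Below is a minimal sketch of the matching settings.py entries, assuming a local MongoDB instance: the setting names are the ones the pipeline looks up, the database and collection names are only illustrative, and the module path in ITEM_PIPELINES assumes the default layout of a Scrapy project named zhihuinfo.

# settings.py (sketch)
MONGODB_HOST = "127.0.0.1"        # assumes MongoDB runs locally
MONGODB_PORT = 27017
MONGODB_DBNAME = "zhihu"          # illustrative database name
MONGODB_DOCNAME = "userinfo"      # illustrative collection name

# Enable the pipeline so Scrapy routes every yielded item through it.
ITEM_PIPELINES = {
    "zhihuinfo.pipelines.ZhihuinfoPipeline": 300,
}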


items
import scrapy


class ZhihuinfoItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    url_token = scrapy.Field()
    headline = scrapy.Field()
    follower_count = scrapy.Field()
    answer_count = scrapy.Field()
    articles_count = scrapy.Field()
    id = scrapy.Field()
    type = scrapy.Field()
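
With the spider, pipeline, and item definitions in place (plus the settings sketched above), the crawl is started from the project root with the usual Scrapy command for the spider name defined in the code:

scrapy crawl zhihu

Each user that passes the userinfo.txt duplicate check is then written to the MongoDB collection as one document containing the eight fields defined in ZhihuinfoItem.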



Original article: https://www.cnblogs.com/cwkcwk/p/9710805.html
