
Scrapy framework project: scraping Lianjia's second-hand housing listings for all of Wuhan

Posted: 2018-09-27 01:53:07


import scrapy
from collections import Counter
from lianjia.items import LianjiaItem


class LianjiaSpiderSpider(scrapy.Spider):
    name = 'lianjia_spider'
    allowed_domains = ['wh.lianjia.com']
    start_urls = ['https://wh.lianjia.com/ershoufang/baibuting/']

    def parse(self, response):
        info_list = response.xpath("//div//ul//li[@class='clear LOGCLICKDATA']")
        for i in info_list:
            # Build a fresh item per listing; reusing a single instance across
            # yields lets later assignments clobber items already queued.
            item = LianjiaItem()

            # Community (xiaoqu) name
            item["xiaoqu_name"] = i.xpath('.//div[@class="houseInfo"]//a[@target="_blank"]/text()').extract_first()

            # Listing title
            item["name"] = i.xpath('.//div[@class="info clear"]//a/text()').extract_first()

            # Area / district label
            item["area"] = i.xpath('.//div[@class="info clear"]//div[@class="positionInfo"]//a/text()').extract_first()

            # Detail-page URL
            item["link"] = i.xpath(".//div[@class='title']//@href").extract_first()

            # houseInfo text: layout, size, orientation, decoration, elevator, etc.
            item["summary"] = i.xpath('.//div[@class="houseInfo"]/text()').extract_first()

            # Floor information
            item["floor"] = i.xpath('.//div[@class="info clear"]//div[@class="positionInfo"]/text()').extract_first()

            # Total price; append + "万" here to attach the unit (10,000 CNY)
            item["zongjia"] = i.xpath('.//div[@class="info clear"]//div[@class="totalPrice"]//span/text()').extract_first()

            # Unit price per square metre
            item["danjia"] = i.xpath('.//div[@class="info clear"]//div[@class="unitPrice"]//span/text()').extract_first()

            yield item

        # Analysis shows that searching directly under a large district such as
        # Wuchang or Hankou returns at most 30 pages of results, so to crawl
        # everything we must walk through every sub-area link one by one.
        area_list = [
            "baibuting", "dazhilu", "dijiao", "erqi2", "houhu", "huangpuyongqing", "qianjinjianghan", "sanyanglu", "tazihu", "yucaihuaqiao",
            "changqinglu", "changfengchangmatou", "changganglu", "taibeixiangganglu", "tangjiadun", "wuguangwansongyuan", "xinhualuwanda", "yangchahu",
            "baofengchongren", "changfengchangmatou", "cbdxibeihu", "gutian", "hanzhengjie", "jixian2", "wujiashan", "zongguan",
            "changqinghuayuan", "dongxihuqita", "jinyinhu", "jiangjunlu", "baishazhou", "chuhehanjie", "donghudongting", "jiedaokou", "jiyuqiao", "shuiguohu", "shouyi", "shahu",
            "tuanjiedadao", "wuchanghuochezhan", "xudong", "yangyuan", "zhongbeilu", "zhongnandingziqiao", "zhuodaoquan", "hongshanqita", "qingshan1", "huquanyangjiawan", "luoshinanlu",
            "laonanhu", "nanhuwoerma", "xinnanhu", "qilimiao", "sixin", "wangjiawan", "zhongjiacun", "guanxichangzhi", "guangguguangchang", "guanshandadao", "guanggunan", "guanggudong",
            "huakeda", "jinronggang", "minzudadao", "sanhuannan", "canglongdao", "jiangxiaqita", "miaoshan", "wenhuadadao", "caidianqita", "dunkou",
            "hankoubei", "huangbeiqita", "panlongcheng", "qianchuan", "xinzhouqita", "yangluo"]

        # counter = Counter(area_list)  # check the list for duplicate slugs
        # print(counter)

        # After covering every area, also request pages 1-30 of each; only then
        # is everything on the site crawled, otherwise data is badly incomplete.
        # Scrapy's default dupefilter drops already-seen URLs, so re-yielding
        # these requests from every parsed page is harmless.
        for i in area_list:
            for num in range(1, 31):  # pg1 .. pg30 (the site shows at most 30 pages per area)
                yield scrapy.Request("https://wh.lianjia.com/ershoufang/" + i + "/pg" + str(num), callback=self.parse)
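As an aside, the commented-out Counter check above is worth running once: as written, area_list contains a duplicate slug ("changfengchangmatou" appears twice). Scrapy's request dupefilter makes the duplicate harmless at crawl time, but a one-liner confirms it:

from collections import Counter

# Report any area slug that occurs more than once in area_list.
dupes = [slug for slug, n in Counter(area_list).items() if n > 1]
print(dupes)  # ['changfengchangmatou']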

items.py and pipelines.py need nothing special here; write them the conventional way and they just work.
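For reference, a minimal items.py matching the fields the spider populates might look like the sketch below (the field names are taken directly from the spider above; the author's actual file is not shown):

import scrapy


class LianjiaItem(scrapy.Item):
    xiaoqu_name = scrapy.Field()  # community (xiaoqu) name
    name = scrapy.Field()         # listing title
    area = scrapy.Field()         # area / district label
    link = scrapy.Field()         # detail-page URL
    summary = scrapy.Field()      # layout, size, orientation, decoration, elevator
    floor = scrapy.Field()        # floor information
    zongjia = scrapy.Field()      # total price (10,000 CNY)
    danjia = scrapy.Field()       # unit price per square metre

With the item defined, Scrapy's built-in feed export is enough to persist results without a custom pipeline, e.g. scrapy crawl lianjia_spider -o wuhan_ershoufang.csv.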


Original post: https://www.cnblogs.com/cwkcwk/p/9710827.html
