
Scraping Community Data from Maitian Xiamen

2019-08-26


When I first started scraping I didn't send headers to disguise the requests as a browser, so Maitian banned my IP on its Beijing and Fuzhou community pages. Luckily I found the cause while the Xiamen pages were still intact; the working script is further below.
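Before running the full scraper, a quick sanity check can show whether the site treats the default requests User-Agent differently from a browser one. This is only a sketch; exactly how Maitian blocks (status code, captcha page, empty body) is an assumption on my part:

# -*- coding: utf-8 -*-
import requests

url = "http://xm.maitian.cn/xqall"
browser_ua = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                            "(KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"}

# Compare the two responses: a block often shows up as a non-200 status
# or a much shorter body, though the site's real behavior may differ.
bare = requests.get(url)
disguised = requests.get(url, headers=browser_ua)
print(bare.status_code, len(bare.text))
print(disguised.status_code, len(disguised.text))

The full scraper follows.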

# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup

page_url = "http://xm.maitian.cn/xqall"
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36",
    "Referer": "http://xm.maitian.cn/esfall",
    "Connection": "keep-alive",
    "Content-Type": "text/plain; charset=utf-8",
}


def get_communities_url():
    """Collect the detail-page URL of every community on the index page
    and scrape each one."""
    all_data = []
    try:
        response = requests.get(url=page_url, headers=headers)
    except Exception:
        print("Request failed")
        raise

    soup = BeautifulSoup(response.text, "lxml")
    list_wrap = soup.find("div", "list_wrap")
    for tag_li in list_wrap.find_all("li"):
        # each <li> is one community; its <h1><a> links to the detail page
        href = tag_li.h1.a["href"]
        new_url = page_url.replace("/xqall", href)
        dict_data = get_target_info(new_url)
        if dict_data:
            all_data.append(dict_data)
    return all_data

def get_target_info(new_url):
    """Scrape one community's detail page into a dict."""
    info = {}  # renamed from `dict`, which shadows the built-in

    try:
        response = requests.get(url=new_url, headers=headers)
    except Exception:
        print("Request failed")
        raise

    soup = BeautifulSoup(response.text, "lxml")
    soup1 = soup.find("section", "home_main")
    ps = soup1.find_all("p")
    # average listing price of the community
    info["community_avg"] = ps[0].b.string.strip()
    # homes for sale
    info["unsold_homes"] = ps[1].find_all("em")[0].a.string
    # homes for rent
    info["rent_homes"] = ps[1].find_all("em")[1].a.string
    # business district the community belongs to
    info["business_circle"] = ps[2].label.string
    # developer
    info["developers"] = ps[2].em.string

    soup2 = soup.find("ul", "home_details")
    for tag_li in soup2.find_all("li"):
        if tag_li["class"] == ["li_left"]:
            p = tag_li.find_all("p")
            info["area"] = p[0].em.string              # floor area
            info["property_company"] = p[1].em.string  # property-management company
            info["industry_fee"] = p[2].em.string      # property-management fee
        elif tag_li["class"] == ["li_center"]:
            p = tag_li.find_all("p")
            info["built_year"] = p[0].em.string        # year built
            info["total_houses"] = p[1].em.string      # total number of homes
            info["green_rates"] = p[2].em.string       # greening rate
        elif tag_li["class"] == ["li_right"]:
            p = tag_li.find_all("p")
            info["cover_area"] = p[0].em.string        # land area
            info["total_built"] = p[1].em.string       # total number of buildings
            info["product_rates"] = p[2].em.string     # plot ratio
    return info



if __name__ == "__main__":
    data_all = get_communities_url()
    print(data_all)
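Getting banned on Beijing and Fuzhou suggests throttling requests as well, not just sending browser headers. Below is a minimal sketch of a hypothetical polite_get wrapper that could stand in for the requests.get calls above; the delay range is an arbitrary assumption, not anything Maitian documents:

import random
import time

import requests

def polite_get(url, headers, min_delay=1.0, max_delay=3.0):
    # Sleep a random interval before each request so the traffic
    # pattern looks less like a bot hammering the server.
    time.sleep(random.uniform(min_delay, max_delay))
    return requests.get(url=url, headers=headers)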

 


Original post: https://www.cnblogs.com/venvive/p/11415472.html
