码迷,mamicode.com
首页 > 其他好文 > 详细

爬取大众点评

时间:2020-04-20 17:33:50      阅读:73      评论:0      收藏:0      [点我收藏+]

标签:doc   set   time   content   offset   get   lis   tle   encoding   

clear_data.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import requests
from docx import Document
from docx.shared import Inches,Pt
from docx.oxml.ns import qn
# Script: read the scraped reviews from code_dict.txt, keep those that mention
# children/family keywords, and export them (text + pictures) to dianping.docx.
import ast

# Keywords that mark a review as child/family related.
# NOTE: the published listing had an empty entry between two commas here. An
# empty string is a substring of every comment and would make every review
# match, so it is intentionally left out.
key_word = ['小孩', '儿童', '儿子', '女儿', '小孩子', '嬉戏', '亲子', '玩具', '宝宝', '宝贝']
childer = []
with open('code_dict.txt', 'r', encoding='utf-8') as f:
    for one_line in f:
        one_line = one_line.strip()
        if not one_line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        # Each line is the str() of a dict of plain literals. literal_eval
        # parses such literals without executing arbitrary code, unlike eval.
        record = ast.literal_eval(one_line)
        comment_text = record.get('comment') or ''
        # Keep the record once, as soon as any keyword occurs in its comment.
        if any(w in comment_text for w in key_word):
            childer.append(record)

print(len(childer))
print(childer[1:10])


doc = Document()

for i in childer:
    print(i)
    # Fall back to empty strings so string concatenation below cannot fail
    # on a record that lacks one of the fields.
    name = i.get('name') or ''
    comment = i.get('comment') or ''
    review_time = i.get('time') or ''
    star = i.get('star')
    pic = i.get('pic') or []

    pen = doc.add_paragraph()
    ph = pen.paragraph_format
    ph.line_spacing = Pt(22)
    user_run = pen.add_run('用户:' + name + '\n')
    star_run = pen.add_run('评分:' + str(star) + '\n')
    time_run = pen.add_run('时间:' + review_time + '\n')
    comment_run = pen.add_run('评论:' + comment + '\n')

    # Every run uses the same 15pt SimSun font; the East Asian font has to be
    # set on the underlying rFonts element in addition to font.name.
    for run in (user_run, star_run, time_run, comment_run):
        run.font.name = '宋体'
        run._element.rPr.rFonts.set(qn('w:eastAsia'), '宋体')
        run.font.size = Pt(15)
    # Only the rating line is bold.
    star_run.bold = True

    for p in pic:
        try:
            req = requests.get(p, timeout=30)
            req.raise_for_status()
        except requests.RequestException as exc:
            # One unreachable picture should not abort the whole export.
            print('skip picture', p, exc)
            continue
        # The image is staged in a scratch file that is overwritten each time.
        with open('capth.png', 'wb') as f:
            f.write(req.content)

        doc.add_picture('capth.png', width=Inches(2.5))

doc.save('dianping.docx')

dazhong.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import datetime
import random
import time
import re

# from selenium.webdriver.chrome.options import Options
# from selenium import webdriver
from lxml import etree
import requests


class DianpingComment:
    """Scrape the review pages of one Dianping shop.

    Review text on the site is partly rendered through empty
    ``<span class="xxxxx"></span>`` elements whose CSS background offset
    selects a glyph inside an SVG file. This class downloads the CSS and SVG,
    builds a ``class name -> character`` mapping, substitutes the characters
    back into each review page, and hands every parsed review to
    :meth:`_data_pipeline`.

    Class attributes:
        font_size: horizontal step used to compute the x offset of the j-th
            glyph in an SVG row (``-j * font_size``).
        start_y: value from which an SVG row's y coordinate is subtracted to
            obtain the row's y offset.
        request_timeout: timeout in seconds passed to every HTTP request.
    """

    font_size = 14
    start_y = 23
    request_timeout = 30

    # Matches the empty placeholder spans that stand in for obfuscated glyphs.
    _SPAN_RE = re.compile(r'<span class="([a-zA-Z0-9]{5,6})"></span>')

    def __init__(self, shop_id, cookies, delay=7, handle_ban=False):
        """Prepare headers, cookies and the first review-page URL.

        Args:
            shop_id: identifier inserted into the shop review URL.
            cookies: raw ``Cookie`` header string (``"a=1; b=2"``).
            delay: base number of seconds to sleep before each page request.
            handle_ban: when true, only a notice is printed; the captcha
                handling itself is disabled (see the commented-out code).
        """
        self.shop_id = shop_id
        self._delay = delay
        self._cookies = self._format_cookies(cookies)
        self._css_headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        }
        self._default_headers = {
            'Connection': 'keep-alive',
            'Host': 'www.dianping.com',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        }
        self._cur_request_url = 'http://www.dianping.com/shop/{}/review_all/p1'.format(shop_id)
        if handle_ban:
            print('不想写跳过验证了')
            # self._browser = self._init_browser()
            # self._handle_ban()

    def run(self):
        """Fetch the CSS link, build the glyph mapping, then crawl all pages."""
        self._css_link = self._get_css_link(self._cur_request_url)
        self._font_dict = self._get_font_dict(self._css_link)
        self._get_conment_page()

    def _delay_func(self):
        """Sleep for a random time within two seconds of ``self._delay``.

        The range is sampled in tenths of a second. The bounds are clamped at
        zero and converted to int so a small or fractional ``delay`` cannot
        produce a negative sleep or an invalid ``randint`` range.
        """
        low = max(0, int((self._delay - 2) * 10))
        high = max(low, int((self._delay + 2) * 10))
        delay_time = random.randint(low, high) * 0.1
        print('睡一会', delay_time)
        time.sleep(delay_time)

    # --- Disabled Selenium-based captcha handling, kept for reference ------
    # def _init_browser(self):
    #     """
    #         Initialise the browser.
    #     """
    #     chrome_options = Options()
    #     chrome_options.add_argument('--headless')
    #     chrome_options.add_argument('--disable-gpu')
    #     browser = webdriver.Chrome(chrome_options=chrome_options)
    #     browser.get(self._cur_request_url)
    #     for name, value in self._cookies.items():
    #         browser.add_cookie({'name': name, 'value': value})
    #     browser.refresh()
    #     return browser

    # def _handle_ban(self):
    #     """
    #         Handle the verification shown when crawling too fast.
    #     """
    #     try:
    #         self._browser.refresh()
    #         time.sleep(1)
    #         button = self._browser.find_element_by_id('yodaBox')
    #         move_x_offset = self._browser.find_element_by_id('yodaBoxWrapper').size['width']
    #         webdriver.ActionChains(self._browser).drag_and_drop_by_offset(
    #             button, move_x_offset, 0).perform()
    #     except:
    #         pass

    def _format_cookies(self, cookies):
        """Convert a raw ``Cookie`` header string into a dict.

        All spaces are removed, the string is split on ``;`` and each piece
        is split on its first ``=`` only, so values that themselves contain
        ``=`` are kept intact. Pieces without a name or without ``=`` (for
        example after a trailing ``;``) are ignored. A later duplicate name
        overrides an earlier one.

        Args:
            cookies: string such as ``"a=1; b=2"``.

        Returns:
            dict mapping cookie names to values.
        """
        parsed = {}
        for pair in cookies.replace(' ', '').split(';'):
            key, sep, value = pair.partition('=')
            if not sep or not key:
                continue
            parsed[key] = value
        return parsed

    def _replace_font_spans(self, html):
        """Replace glyph placeholder spans in ``html`` with their characters.

        Spans whose class is not present in ``self._font_dict`` are left
        untouched instead of raising ``KeyError``. A replacement function is
        used so the glyph is inserted literally, never interpreted as a regex
        replacement template.
        """
        font_dict = self._font_dict
        return self._SPAN_RE.sub(
            lambda match: font_dict.get(match.group(1), match.group(0)), html)

    def _get_conment_page(self):   # fetch the review content
        """Request every review page and decode the placeholder spans.

        Starting from ``self._cur_request_url``, each page is fetched after a
        random delay, decoded, parsed with lxml and passed to
        :meth:`_parse_comment_page`. The loop follows the ``NextPage`` link
        and stops when a page has none.
        """
        while self._cur_request_url:
            self._delay_func()
            print('[{now_time}] {msg}'.format(now_time=datetime.datetime.now(), msg=self._cur_request_url))
            res = requests.get(self._cur_request_url, headers=self._default_headers,
                               cookies=self._cookies, timeout=self.request_timeout)
            html = self._replace_font_spans(res.text)

            doc = etree.HTML(html)
            self._parse_comment_page(doc)

            # The page just fetched becomes the Referer of the next request.
            self._default_headers['Referer'] = self._cur_request_url
            next_links = doc.xpath('.//a[@class="NextPage"]/@href')
            if next_links:
                self._cur_request_url = 'http://www.dianping.com' + next_links[0]
            else:
                self._cur_request_url = None

    def _data_pipeline(self, data):
        """Handle one parsed review; the base implementation prints it.

        Subclasses override this to store the data.
        """
        print(data)

    def _parse_comment_page(self, doc):
        """Extract every review on a parsed page and pass it to the pipeline.

        Each review becomes a dict with the keys ``name``, ``comment``,
        ``star``, ``pic`` (list of picture URLs) and ``time``. ``star`` is the
        string captured from the rating CSS class, or the integer ``0`` when
        no rating element is found. A missing name or time yields an empty
        string rather than aborting the crawl.

        Args:
            doc: lxml element tree of a review page.
        """
        blank = '\n\r \t'
        for li in doc.xpath('//*[@class="reviews-items"]/ul/li'):
            names = li.xpath('.//a[@class="name"]/text()')
            name = names[0].strip(blank) if names else ''
            try:
                star = li.xpath('.//span[contains(./@class, "sml-str")]/@class')[0]
                star = re.findall(r'sml-rank-stars sml-str(.*?) star', star)[0]
            except IndexError:
                star = 0
            times = li.xpath('.//span[@class="time"]/text()')
            # Named review_time so the module-level ``time`` is not shadowed.
            review_time = times[0].strip(blank) if times else ''

            pics = []
            for pic in li.xpath('.//*[@class="review-pictures"]/ul/li'):
                print(pic.xpath('.//a/@href'))
                big = pic.xpath('.//a/img/@data-big')
                if big:
                    pics.append(big[0])

            # Prefer the "review-words Hide" block; fall back to the plain
            # "review-words" block when it yields no text.
            comment = ''.join(li.xpath('.//div[@class="review-words Hide"]/text()')).strip(blank)
            if not comment:
                comment = ''.join(li.xpath('.//div[@class="review-words"]/text()')).strip(blank)

            data = {
                'name': name,
                'comment': comment,
                'star': star,
                'pic': pics,
                'time': review_time,
            }
            self._data_pipeline(data)

    def _get_css_link(self, url):
        """Request the first review page and return its glyph CSS URL.

        Args:
            url: URL of a review page.

        Returns:
            Absolute ``http://s3plus.meituan.net/v1/...`` stylesheet URL.

        Raises:
            AssertionError: if no matching stylesheet link is in the page.
        """
        res = requests.get(url, headers=self._default_headers,
                           cookies=self._cookies, timeout=self.request_timeout)
        html = res.text
        # A looser pattern that was tried earlier:
        # css_link = re.search(r'<link re.*?css.*?href="(.*?svgtextcss.*?)">', html)
        css_link = re.findall(r'<link rel="stylesheet" type="text/css" href="//s3plus.meituan.net/v1/(.*?)">', html)

        assert css_link, 'glyph stylesheet link not found in the review page'
        css_link = 'http://s3plus.meituan.net/v1/' + css_link[0]
        return css_link

    def _get_font_dict(self, url):
        """Build the ``CSS class name -> character`` mapping.

        Downloads the stylesheet, extracts the SVG background URL and the
        per-class background offsets, downloads the SVG and resolves every
        class to the glyph found at its offset.

        Args:
            url: stylesheet URL returned by :meth:`_get_css_link`.

        Returns:
            dict mapping class names to single characters.
        """
        res = requests.get(url, headers=self._css_headers, timeout=self.request_timeout)
        html = res.text

        background_image_link = re.findall(r'background-image: url\((.*?)\);', html)
        print('带有svg的链接', background_image_link)
        assert background_image_link, 'no svg background-image found in the stylesheet'
        # NOTE(review): the second background image is used, as in the
        # original code — presumably the one holding review text; confirm
        # against the live stylesheet.
        background_image_link = 'http:' + background_image_link[1]
        # Drop the generic span rules so only per-class offset rules remain.
        html = re.sub(r'span.*?\}', '', html)
        group_offset_list = re.findall(r'\.([a-zA-Z0-9]{5,6}).*?round:(.*?)px (.*?)px;', html)  # classes in the css
        print('css中class对应坐标', group_offset_list)
        font_dict_by_offset = self._get_font_dict_by_offset(background_image_link)  # svg glyphs keyed by offset
        print('解析svg成字典', font_dict_by_offset)

        return self._map_classes_to_glyphs(group_offset_list, font_dict_by_offset)

    @staticmethod
    def _map_classes_to_glyphs(group_offset_list, font_dict_by_offset):
        """Resolve ``(class, x, y)`` offset triples to characters.

        Offsets arrive as strings such as ``"-14.0"`` and are converted via
        ``int(float(...))``. Classes whose offsets are not numeric, or whose
        row or column is absent from ``font_dict_by_offset``, are skipped.
        """
        font_dict = {}
        for class_name, x_offset, y_offset in group_offset_list:
            try:
                x = int(float(x_offset))
                y = int(float(y_offset))
            except ValueError:
                continue
            row = font_dict_by_offset.get(y)
            if row and x in row:
                font_dict[class_name] = row[x]
        return font_dict

    def _get_font_dict_by_offset(self, url):
        """Download the SVG and return its glyphs keyed by offset.

        There are at least two SVG layouts (only two have been seen so far);
        both are handled by :meth:`_parse_svg_rows`.

        Returns:
            ``{y_offset: {x_offset: character}}``.
        """
        res = requests.get(url, headers=self._css_headers, timeout=self.request_timeout)
        return self._parse_svg_rows(res.text)

    def _parse_svg_rows(self, svg):
        """Parse SVG text into ``{y_offset: {x_offset: character}}``.

        Layout 1: ``d="M0 <y> ..."`` path definitions paired in order with
        ``<textPath>`` strings. Layout 2: ``<text ... y="<y>">string<``
        elements. In layout 1 the pairing stops at the shorter of the two
        lists, so a surplus ``textPath`` cannot raise ``IndexError``.
        """
        y_list = re.findall(r'd="M0 (\d+?) ', svg)
        if y_list:
            strings = re.findall(r'<textPath .*?>(.*?)<', svg)
            rows = zip(y_list, strings)
        else:
            rows = re.findall(r'<text.*?y="(.*?)">(.*?)<', svg)

        font_dict = {}
        for y, string in rows:
            y_offset = self.start_y - int(y)
            # The j-th character of a row sits at x offset -j * font_size.
            font_dict[y_offset] = {
                -j * self.font_size: font for j, font in enumerate(string)
            }
        return font_dict


# Running this module directly does nothing; the guard body is only `pass`.
if __name__ == "__main__":
    pass

demo.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-


from dazhong import DianpingComment

# Raw Cookie header passed to the scraper as its `cookies` argument.
# NOTE: this is a hard-coded login session captured from a browser. Replace it
# with your own cookie string, and avoid committing live credentials.
COOKIES = (
    '_lxsdk_cuid=1699b152d90c8-04b0ee8b481697-541f3415-1fa400-1699b152d91c8; '
    '_lxsdk=1699b152d90c8-04b0ee8b481697-541f3415-1fa400-1699b152d91c8; '
    '_hc.v=992d8c67-a9b0-ee61-c6cf-ed9b42cfe11f.1553085051; '
    '_thirdu.c=136cbfec8b174105c45f6628ce431df6; '
    'ctu=cc29f77c02b4556c6a1db1c67c5c10e084f7f63d00208c59788c11a4845348aa; '
    'cy=160; '
    'cye=zhengzhou; '
    'thirdtoken=e0dfd5bf-3cc9-482c-a559-ecb5a5408581; '
    'dper=13f0e16d38f4829e80270687b88c4ce8d36d333a6f525bc6be3dec9bbc60b1d7f44f8b47a413dc1c18f3ef5fed921594f3c5161e72d50fed52f3006625babe559507c56bb8b77d1f9dd95d104ffb3cdba1c49805e34df17c99e3ba781183b850; '
    'll=7fd06e815b796be3df069dec7836c3df; '
    'ua=aJay13; '
    'ctu=a5f067d1428ce75e417e53634b352a7767a63503c85b2d59c0c70ae996add3e701d656899061b0eddfa568430b723553; '
    '_lxsdk_s=1699df6ef73-4f6-781-d9c%7C%7C719'
)


class Customer(DianpingComment):
    """Scraper variant that also appends every review to ``code_dict.txt``."""

    def _data_pipeline(self, data):
        """Print one review and append it to ``code_dict.txt``.

        The review is written as ``str(data)`` followed by a newline, so the
        file holds one dict literal per line.

        Args:
            data: review dict passed in by the base class's page parser.
        """
        print(data)
        with open('code_dict.txt', 'a+', encoding='utf-8') as f:
            f.write(str(data) + '\n')

if __name__ == "__main__":
    # Crawl the reviews of shop 1726435 using the session cookie above.
    scraper = Customer(1726435, cookies=COOKIES)
    scraper.run()

 

 

爬取大众点评

标签:doc   set   time   content   offset   get   lis   tle   encoding   

原文地址:https://www.cnblogs.com/-stewart/p/12739173.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!