爬取大众点评

时间：2020-04-20 17:33:50 阅读：73 评论：0 收藏：0 [点我收藏+]

标签：doc set time content offset get lis tle encoding

clear_data.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import ast

import requests
from docx import Document
from docx.shared import Inches,Pt
from docx.oxml.ns import qn
# Export script: pick the reviews in code_dict.txt that mention children and
# write them (text plus downloaded pictures) into dianping.docx.

# A review is kept when its comment contains at least one of these words.
key_word = ['小孩', '儿童', '儿子', '女儿', '小孩子', '娃', '嬉戏', '亲子', '玩具', '宝宝', '宝贝']
childer = []
with open('code_dict.txt', 'r', encoding='utf-8') as f:
    for one_line in f:
        if not one_line.strip():
            # Tolerate blank lines instead of failing to parse them.
            continue
        # Every line is expected to hold one Python dict literal.
        # literal_eval parses it without executing code; the previous eval()
        # would have run arbitrary expressions embedded in scraped text.
        record = ast.literal_eval(one_line)
        if any(w in record['comment'] for w in key_word):
            childer.append(record)

print(len(childer))
print((childer[1:10]))


def _style_run(run):
    """Apply the shared 15pt SimSun (宋体) formatting to one docx run."""
    run.font.name = '宋体'
    # The East Asian font slot is set explicitly on the run's rFonts element;
    # presumably font.name alone does not cover CJK text -- TODO confirm.
    run._element.rPr.rFonts.set(qn('w:eastAsia'), '宋体')
    run.font.size = Pt(15)


doc = Document()

for i in childer:
    print(i)
    name = i.get('name', '')
    comment = i.get('comment', '')
    review_time = i.get('time', '')
    star = i.get('star')
    pic = i.get('pic')

    pen = doc.add_paragraph()
    pen.paragraph_format.line_spacing = Pt(22)
    name_run = pen.add_run('用户：' + name + '\n')
    star_run = pen.add_run('评分：' + str(star) + '\n')
    time_run = pen.add_run('时间：' + review_time + '\n')
    comment_run = pen.add_run('评论：' + comment + '\n')

    for run in (name_run, star_run, time_run, comment_run):
        _style_run(run)
    # Only the rating line is emphasised.
    star_run.bold = True

    for p in pic or []:
        try:
            req = requests.get(p, timeout=10)
            req.raise_for_status()
        except requests.RequestException as exc:
            # One unreachable picture should not abort the whole export.
            print('skip picture', p, exc)
            continue
        # The picture is staged in a scratch file that is overwritten for
        # every download and then embedded from that path.
        with open('capth.png', 'wb') as f:
            f.write(req.content)

        doc.add_picture('capth.png', width=Inches(2.5))

doc.save('dianping.docx')

dazhong.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import datetime
import random
import time
import re

# from selenium.webdriver.chrome.options import Options
# from selenium import webdriver
from lxml import etree
import requests


class DianpingComment:
    """Crawl all review pages of one Dianping shop and decode obfuscated text.

    The review HTML contains empty ``<span class="xxxxx"></span>`` elements in
    place of some characters.  This class downloads the page's stylesheet,
    reads the background offsets attached to each such class, looks those
    offsets up in the SVG file referenced by the stylesheet, and substitutes
    the resulting characters back into the HTML before parsing the reviews.

    Subclasses receive each parsed review through ``_data_pipeline``.
    """

    # Horizontal step between glyphs and vertical baseline used to turn SVG
    # positions into CSS offsets -- presumably pixels; TODO confirm against
    # the live stylesheet.
    font_size = 14
    start_y = 23
    # Seconds to wait for any single HTTP response before giving up.
    request_timeout = 10

    _USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
    _SPAN_RE = re.compile(r'<span class="([a-zA-Z0-9]{5,6})"></span>')

    def __init__(self, shop_id, cookies, delay=7, handle_ban=False):
        """Prepare headers, cookies and the first review URL.

        Args:
            shop_id: Shop identifier inserted into the review URL.
            cookies: Raw ``name=value; name=value`` cookie header string.
            delay: Centre of the random pause (seconds) between page requests.
            handle_ban: Accepted for compatibility; ban handling was never
                implemented, so enabling it only prints a notice.
        """
        self.shop_id = shop_id
        self._delay = delay
        self._cookies = self._format_cookies(cookies)
        self._css_headers = {
            'User-Agent': self._USER_AGENT,
        }
        self._default_headers = {
            'Connection': 'keep-alive',
            'Host': 'www.dianping.com',
            'User-Agent': self._USER_AGENT,
        }
        self._cur_request_url = 'http://www.dianping.com/shop/{}/review_all/p1'.format(shop_id)
        if handle_ban:
            print('不想写跳过验证了')

    def run(self):
        """Build the class-to-character table, then crawl every review page."""
        self._css_link = self._get_css_link(self._cur_request_url)
        self._font_dict = self._get_font_dict(self._css_link)
        self._get_conment_page()

    def _delay_func(self):
        """Sleep for a random time within two seconds of the configured delay."""
        # uniform() accepts float delays, and the lower bound is clamped so a
        # small delay can never produce a negative sleep.
        low = max(0, self._delay - 2)
        high = max(low, self._delay + 2)
        delay_time = round(random.uniform(low, high), 1)
        print('睡一会', delay_time)
        time.sleep(delay_time)

    def _format_cookies(self, cookies):
        """Convert a raw cookie header string into a ``{name: value}`` dict.

        All spaces are removed first.  Each pair is split on the first ``=``
        only, so values that contain ``=`` stay intact.  Segments without a
        name or without ``=`` (for example after a trailing ``;``) are skipped.
        """
        jar = {}
        for pair in cookies.replace(' ', '').split(';'):
            name, sep, value = pair.partition('=')
            if sep and name:
                jar[name] = value
        return jar

    def _replace_font_spans(self, html):
        """Replace every known obfuscation span in *html* with its character.

        Spans whose class is missing from the font table are left untouched
        instead of raising ``KeyError``.
        """
        font_dict = self._font_dict

        def _lookup(match):
            # A callable replacement keeps the glyph literal; it is never
            # interpreted as a regex replacement template.
            return font_dict.get(match.group(1), match.group(0))

        return self._SPAN_RE.sub(_lookup, html)

    def _get_conment_page(self):
        """Fetch each review page, decode it, parse it, and follow NextPage."""
        while self._cur_request_url:
            self._delay_func()
            print('[{now_time}] {msg}'.format(now_time=datetime.datetime.now(), msg=self._cur_request_url))
            res = requests.get(self._cur_request_url, headers=self._default_headers,
                               cookies=self._cookies, timeout=self.request_timeout)
            doc = etree.HTML(self._replace_font_spans(res.text))
            self._parse_comment_page(doc)

            # The page just fetched becomes the Referer of the next request.
            self._default_headers['Referer'] = self._cur_request_url
            next_links = doc.xpath('.//a[@class="NextPage"]/@href')
            # No NextPage link means the last page was reached; None ends the loop.
            self._cur_request_url = ('http://www.dianping.com' + next_links[0]) if next_links else None

    def _data_pipeline(self, data):
        """Handle one parsed review dict; the default just prints it."""
        print(data)

    def _parse_comment_page(self, doc):
        """Extract every review on a parsed page and pass it to the pipeline.

        Each review becomes a dict with the keys ``name``, ``comment``,
        ``star``, ``pic`` (list of picture URLs) and ``time``.  Missing name
        or time nodes yield an empty string rather than an ``IndexError``.
        """
        blanks = '\n\r \t'
        for li in doc.xpath('//*[@class="reviews-items"]/ul/li'):
            name_nodes = li.xpath('.//a[@class="name"]/text()')
            name = name_nodes[0].strip(blanks) if name_nodes else ''

            # The rating is embedded in the class attribute of the star span;
            # 0 is reported when it cannot be found.
            star = 0
            star_classes = li.xpath('.//span[contains(./@class, "sml-str")]/@class')
            if star_classes:
                found = re.findall(r'sml-rank-stars sml-str(.*?) star', star_classes[0])
                if found:
                    star = found[0]

            time_nodes = li.xpath('.//span[@class="time"]/text()')
            review_time = time_nodes[0].strip(blanks) if time_nodes else ''

            pics = []
            for pic in li.xpath('.//*[@class="review-pictures"]/ul/li'):
                print(pic.xpath('.//a/@href'))
                big = pic.xpath('.//a/img/@data-big')
                if big:
                    pics.append(big[0])

            # Try the "review-words Hide" block first and fall back to the
            # plain "review-words" block when it yields no text.
            comment = ''.join(li.xpath('.//div[@class="review-words Hide"]/text()')).strip(blanks)
            if not comment:
                comment = ''.join(li.xpath('.//div[@class="review-words"]/text()')).strip(blanks)

            data = {
                'name': name,
                'comment': comment,
                'star': star,
                'pic': pics,
                'time': review_time,
            }
            self._data_pipeline(data)

    def _get_css_link(self, url):
        """Fetch the first review page and return its s3plus stylesheet URL.

        Raises:
            AssertionError: when no matching stylesheet link is present.
        """
        res = requests.get(url, headers=self._default_headers,
                           cookies=self._cookies, timeout=self.request_timeout)
        css_link = re.findall(
            r'<link rel="stylesheet" type="text/css" href="//s3plus.meituan.net/v1/(.*?)">', res.text)
        if not css_link:
            # Raised explicitly so the check survives ``python -O``.
            raise AssertionError('no s3plus stylesheet link found in {}'.format(url))
        return 'http://s3plus.meituan.net/v1/' + css_link[0]

    @staticmethod
    def _to_int_offset(text):
        """Convert a CSS offset such as ``'-14.0'`` or ``'-14'`` to an int."""
        return int(float(text))

    def _parse_css_offsets(self, css):
        """Return ``(class_name, x_offset, y_offset)`` string triples from CSS."""
        # Drop every ``span...}`` rule first so the pattern below cannot
        # match inside them.
        css = re.sub(r'span.*?\}', '', css)
        return re.findall(r'\.([a-zA-Z0-9]{5,6}).*?round:(.*?)px (.*?)px;', css)

    def _map_classes_to_chars(self, group_offset_list, font_dict_by_offset):
        """Resolve each CSS class to the character found at its offsets.

        Classes whose offsets are malformed or do not hit any character are
        skipped.
        """
        font_dict = {}
        for class_name, x_raw, y_raw in group_offset_list:
            try:
                x_offset = self._to_int_offset(x_raw)
                y_offset = self._to_int_offset(y_raw)
            except ValueError:
                continue
            row = font_dict_by_offset.get(y_offset)
            if row and x_offset in row:
                font_dict[class_name] = row[x_offset]
        return font_dict

    def _get_font_dict(self, url):
        """Download the stylesheet at *url* and build ``{class: character}``.

        Raises:
            AssertionError: when the stylesheet has fewer than two
                ``background-image`` URLs.
        """
        res = requests.get(url, headers=self._css_headers, timeout=self.request_timeout)
        css = res.text

        background_image_link = re.findall(r'background-image: url\((.*?)\);', css)
        print('带有svg的链接', background_image_link)
        if len(background_image_link) < 2:
            raise AssertionError('expected at least two background-image URLs in {}'.format(url))
        # NOTE(review): the second URL is assumed to be the SVG used for
        # review text -- confirm against the live stylesheet.
        svg_url = 'http:' + background_image_link[1]

        group_offset_list = self._parse_css_offsets(css)
        print('css中class对应坐标', group_offset_list)
        font_dict_by_offset = self._get_font_dict_by_offset(svg_url)
        print('解析svg成字典', font_dict_by_offset)

        return self._map_classes_to_chars(group_offset_list, font_dict_by_offset)

    def _parse_svg_rows(self, svg):
        """Turn SVG markup into ``{y_offset: {x_offset: character}}``.

        Two layouts are handled: ``<path d="M0 Y ...">`` rows paired in order
        with ``<textPath>`` strings, and ``<text y="Y">`` elements that carry
        both the row position and the string.
        """
        path_ys = re.findall(r'd="M0 (\d+?) ', svg)
        if path_ys:
            strings = re.findall(r'<textPath .*?>(.*?)<', svg)
            # zip() stops at the shorter list, so a count mismatch cannot
            # raise IndexError.
            rows = zip(path_ys, strings)
        else:
            rows = re.findall(r'<text.*?y="(.*?)">(.*?)<', svg)

        font_dict = {}
        for y, string in rows:
            y_offset = self.start_y - int(y)
            # Glyph j of a row sits j * font_size to the right, which is a
            # negative background offset.
            font_dict[y_offset] = {-j * self.font_size: font for j, font in enumerate(string)}
        return font_dict

    def _get_font_dict_by_offset(self, url):
        """Download the SVG at *url* and index its characters by offset."""
        res = requests.get(url, headers=self._css_headers, timeout=self.request_timeout)
        return self._parse_svg_rows(res.text)


# Running this module directly does nothing.
if __name__ == "__main__":
    pass

demo.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-


from dazhong import DianpingComment

# Raw cookie header string passed to the crawler as its ``cookies`` argument.
# NOTE(review): this looks like a captured browser session -- it will
# presumably expire and should be replaced with your own before running.
COOKIES = '_lxsdk_cuid=1699b152d90c8-04b0ee8b481697-541f3415-1fa400-1699b152d91c8; _lxsdk=1699b152d90c8-04b0ee8b481697-541f3415-1fa400-1699b152d91c8; _hc.v=992d8c67-a9b0-ee61-c6cf-ed9b42cfe11f.1553085051; _thirdu.c=136cbfec8b174105c45f6628ce431df6; ctu=cc29f77c02b4556c6a1db1c67c5c10e084f7f63d00208c59788c11a4845348aa; cy=160; cye=zhengzhou; thirdtoken=e0dfd5bf-3cc9-482c-a559-ecb5a5408581; dper=13f0e16d38f4829e80270687b88c4ce8d36d333a6f525bc6be3dec9bbc60b1d7f44f8b47a413dc1c18f3ef5fed921594f3c5161e72d50fed52f3006625babe559507c56bb8b77d1f9dd95d104ffb3cdba1c49805e34df17c99e3ba781183b850; ll=7fd06e815b796be3df069dec7836c3df; ua=aJay13; ctu=a5f067d1428ce75e417e53634b352a7767a63503c85b2d59c0c70ae996add3e701d656899061b0eddfa568430b723553; _lxsdk_s=1699df6ef73-4f6-781-d9c%7C%7C719'


class Customer(DianpingComment):
    """Crawler variant that also appends every parsed review to a text file."""

    def _data_pipeline(self, data):
        """Print *data* and append its ``str()`` form as one line.

        The line is appended to ``code_dict.txt`` in the current working
        directory, encoded as UTF-8.
        """
        print(data)
        with open('code_dict.txt', 'a+', encoding='utf-8') as f:
            f.write(str(data) + '\n')

if __name__ == "__main__":
    # Crawl every review page of shop 1726435 using the cookies above.
    dianping = Customer('1726435', cookies=COOKIES)
    dianping.run()

爬取大众点评

标签：doc set time content offset get lis tle encoding

原文地址：https://www.cnblogs.com/-stewart/p/12739173.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行