今日头条街拍图片爬取

时间：2018-08-17 00:44:09 阅读：140 评论：0 收藏：0 [点我收藏+]

标签：safari ror 析构操作 int mozilla def 修改 amp

import re
import requests
import os
from urllib import request
import json
from mysql_tu import mysql_conn


# Crawl Toutiao search results for the keyword "街拍" (street snap) and record
# the file name of every gallery image in the MySQL table `jiepai`.
# The actual image download is disabled; only the file name is stored.

# Browser-like User-Agent sent with every request.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}

# Gallery pages embed their image list as `gallery: JSON.parse(<string>),`.
# Compiled once because it is reused for every article.
gallery_pattern = re.compile(r'gallery: JSON\.parse\((.*)\),')

# Create the output directory once instead of re-checking it on every page.
os.makedirs('cccc', exist_ok=True)

# A single database connection is shared by all inserts of this run.
mc = mysql_conn()

# The search API is paged by `offset`; fetch three pages of 20 results each.
for offset in range(0, 60, 20):
    url = 'https://www.toutiao.com/search_content/?offset={}&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab'.format(offset)

    response = requests.get(url, headers=headers)
    html_json_dict = response.json()

    # A missing or null `data` field is treated as an empty result page.
    data_list = html_json_dict.get('data') or []
    for data_item in data_list:
        # Some search entries carry no article link; skip them.
        if 'article_url' not in data_item:
            continue
        article_url = data_item['article_url']

        response = requests.get(article_url, headers=headers)
        html_ee = response.text

        match_res = gallery_pattern.search(html_ee)
        if not match_res:
            print('没有那个文件')
            continue

        # The captured text is a JSON string literal whose content is itself
        # JSON, so it has to be decoded twice to obtain the dictionary.
        match_dict = json.loads(json.loads(match_res.group(1)))

        for image in match_dict['sub_images']:
            image_aa = image['url']
            print(image_aa)
            # File name = last path segment of the image URL plus ".jpg".
            filename = image_aa.split('/')[-1] + '.jpg'
            try:
                # The file name comes from a remote page, so it is passed as a
                # bound parameter instead of being formatted into the SQL text.
                mc.cursor.execute('insert into jiepai(filename) values(%s)', (filename,))
                mc.db.commit()
            except TimeoutError:
                print('下载超时')
                continue

# File name: mysql_tu.py -- the code below is a separate module; the crawler above imports mysql_conn from it.

import pymysql

class mysql_conn(object):
    """Small wrapper around one pymysql connection and one cursor.

    The connection is opened in ``__init__`` and released in ``__del__``.
    """

    def __init__(self):
        """Open the connection and create a cursor.

        Raises whatever ``pymysql.connect`` raises when the server cannot be
        reached or the credentials are rejected.
        """
        # Pre-set both attributes so __del__ stays safe if connect() fails.
        self.db = None
        self.cursor = None
        # NOTE(review): host and credentials are hard-coded; consider loading
        # them from configuration or the environment.
        self.db = pymysql.connect(host='127.0.0.1', user='root', password='lxh1122', port=3306, database='py11')
        self.cursor = self.db.cursor()

    def execute_modify_mysql(self, sql, params=None):
        """Execute one data-modifying statement and commit it.

        Args:
            sql: The SQL text, optionally containing ``%s`` placeholders.
            params: Optional values bound to the placeholders. Prefer this
                over formatting values into ``sql`` by hand.

        Raises:
            Any exception raised while executing or committing; the pending
            transaction is rolled back before the exception is re-raised.
        """
        try:
            self.cursor.execute(sql, params)
            self.db.commit()
        except Exception:
            # Do not leave a half-applied transaction on the connection.
            self.db.rollback()
            raise

    def __del__(self):
        """Close the cursor and the connection if they were ever opened."""
        cursor = getattr(self, 'cursor', None)
        db = getattr(self, 'db', None)
        # Clear the references first so a repeated call becomes a no-op.
        self.cursor = None
        self.db = None
        try:
            if cursor is not None:
                cursor.close()
        finally:
            if db is not None:
                db.close()

if __name__ == '__main__':
    # Manual smoke test: insert four empty rows into `jiepai`, reusing one
    # connection for all statements.
    mc = mysql_conn()
    sql = 'insert into jiepai values ()'
    for _ in range(4):
        mc.execute_modify_mysql(sql)

今日头条街拍图片爬取

标签：safari ror 析构操作 int mozilla def 修改 amp

原文地址：https://www.cnblogs.com/lxh777/p/9490895.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行