码迷,mamicode.com
首页 > 其他好文 > 详细

今日头条街拍图片爬取

时间:2018-08-17 00:44:09      阅读:140      评论:0      收藏:0      [点我收藏+]

标签:safari   ror   析构   操作   int   mozilla   def   修改   amp   

import re
import requests
import os
from urllib import request
import json
from mysql_tu import mysql_conn


# Crawl Toutiao search results for the keyword "街拍" (URL-encoded in the
# query string below), open every article that exposes an ``article_url``,
# pull the image gallery embedded in the page and record one file name per
# image in the ``jiepai`` table.

# Browser-like User-Agent sent with every HTTP request.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    )
}

# Search endpoint; ``{}`` receives the paging offset.
search_url = 'https://www.toutiao.com/search_content/?offset={}&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab'

# Captures the argument of the ``gallery: JSON.parse(...)`` assignment that
# article pages embed. Compiled once because it is reused for every article.
gallery_pattern = re.compile(r'gallery: JSON\.parse\((.*)\),')

# Parameterized statement: the file name comes from a remote response, so it
# must never be spliced into the SQL text.
insert_sql = 'insert into jiepai(filename) values(%s)'

# Create the output directory.
if not os.path.exists('cccc'):
    os.mkdir('cccc')

# One connection for the whole run instead of one per image.
mc = mysql_conn()

# Three result pages: offsets 0, 20 and 40.
for offset in range(0, 60, 20):
    url = search_url.format(offset)

    response = requests.get(url, headers=headers)
    html_json_dict = response.json()

    # A missing or null "data" entry is treated as an empty page.
    data_list = html_json_dict.get('data') or []
    for data_item in data_list:
        # Entries without an article link are skipped.
        if 'article_url' not in data_item:
            continue
        article_url = data_item['article_url']

        response = requests.get(article_url, headers=headers)
        html_ee = response.text

        match_res = gallery_pattern.search(html_ee)
        if not match_res:
            print('没有那个文件')
            continue

        # Decoded twice: the captured text is first parsed as JSON, and the
        # result is parsed as JSON again to obtain the gallery dict.
        # NOTE(review): this assumes the page passes a JSON string literal
        # to JSON.parse -- confirm against a live page.
        gallery_text = json.loads(match_res.group(1))
        gallery = json.loads(gallery_text)

        for image in gallery['sub_images']:
            image_aa = image['url']
            print(image_aa)
            try:
                filename = image_aa.split('/')[-1] + '.jpg'

                # Downloading the image is currently disabled; only the
                # file name is recorded in the database.
                mc.cursor.execute(insert_sql, (filename,))
                mc.db.commit()
            except TimeoutError:
                print('下载超时')
                continue
#文件名 mysql_tu.py

import pymysql

class mysql_conn(object):
    """Thin wrapper around one pymysql connection and one cursor.

    The connection is opened in the constructor and released when the
    object is finalized.
    """

    # Constructor: open the connection and create the cursor.
    def __init__(self):
        # Pre-set both attributes so __del__ stays safe if connect() raises.
        self.db = None
        self.cursor = None
        # NOTE(review): connection settings and credentials are hard-coded;
        # consider loading them from configuration.
        self.db = pymysql.connect(host='127.0.0.1', user='root', password='lxh1122', port=3306, database='py11')
        self.cursor = self.db.cursor()

    # Run a data-modifying statement and commit it.
    def execute_modify_mysql(self, sql, args=None):
        """Execute *sql* and commit the transaction.

        Args:
            sql: Statement text; may contain ``%s`` placeholders.
            args: Optional parameters bound to the placeholders by the
                driver. Defaults to ``None`` (no parameters), which matches
                the original one-argument call.

        Raises:
            Whatever the cursor raises; the transaction is rolled back
            before the exception is re-raised.
        """
        try:
            self.cursor.execute(sql, args)
        except Exception:
            # Do not leave a half-applied transaction open on the connection.
            self.db.rollback()
            raise
        self.db.commit()

    # Destructor: close the cursor, then the connection.
    def __del__(self):
        # getattr guards against instances whose __init__ never ran.
        cursor = getattr(self, 'cursor', None)
        db = getattr(self, 'db', None)
        # Clear the references first so a second call is a no-op.
        self.cursor = None
        self.db = None
        if cursor is not None:
            cursor.close()
        if db is not None:
            db.close()

if __name__ == '__main__':
    # Manual smoke test: insert four rows with an empty value list into
    # ``jiepai``, all through a single connection.
    sql = 'insert into jiepai values ()'
    mc = mysql_conn()
    for _ in range(4):
        mc.execute_modify_mysql(sql)

 

今日头条街拍图片爬取

标签:safari   ror   析构   操作   int   mozilla   def   修改   amp   

原文地址:https://www.cnblogs.com/lxh777/p/9490895.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!