爬取今日头条

时间：2018-08-17 00:40:09 阅读：233 评论：0 收藏：0 [点我收藏+]

标签：get ber user path windows amp for tle win

import re
import requests
import json,os
from urllib import request

def get_detail(url, title):
    """Download every image of a gallery article into ``download/<title>/``.

    The article page is fetched and searched for an embedded
    ``gallery: JSON.parse(...)`` expression. The decoded gallery's
    ``sub_images`` entries supply the image URLs, each of which is saved
    under its last URL path segment with ``.jpg`` appended.

    Args:
        url: Address of the article page.
        title: Article title; used (after sanitising) as the folder name.

    Returns:
        None. Pages that do not embed a gallery are skipped.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36'
    }
    # A timeout keeps one unresponsive server from hanging the whole crawl.
    response = requests.get(url, headers=headers, timeout=10)
    html = response.text

    match_res = re.search(r'gallery: JSON\.parse\((.+?)\)', html)
    if match_res is None:
        # The page has no embedded gallery, so there is nothing to download.
        return

    # Two decodes: the captured JSON.parse argument is itself a quoted string
    # whose content is the JSON document describing the gallery.
    result = json.loads(json.loads(match_res.group(1)))

    # Titles are free text; replace characters that are illegal or act as
    # separators in file-system paths, and avoid an empty folder name.
    safe_title = re.sub(r'[\\/:*?"<>|\r\n\t]+', '_', title).strip(' .') or 'untitled'
    target_dir = os.path.join('download', safe_title)
    os.makedirs(target_dir, exist_ok=True)

    for image_ in result.get('sub_images') or []:
        image_url = image_['url']
        fname = image_url.split('/')[-1]
        request.urlretrieve(image_url, os.path.join(target_dir, fname + '.jpg'))

    print(result)
def get_url(offset=0, last_page=4):
    """Walk the search-result pages and download each listed article.

    Starting at ``offset``, result pages of 20 entries are requested one
    after another. Every entry that carries both ``article_url`` and
    ``title`` is handed to ``get_detail``.

    Args:
        offset: Index of the first search result to request.
        last_page: Highest zero-based page index (``offset / 20``) that is
            still processed. The default of 4 keeps the original limit,
            i.e. offsets 0, 20, 40, 60 and 80.

    Returns:
        None.
    """
    page_size = 20
    url = 'https://www.toutiao.com/search_content/?offset={}&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab'

    # Integer arithmetic throughout: true division would turn the offset into
    # a float and put "offset=20.0" into the request URL.
    while offset <= last_page * page_size:
        response = requests.get(url.format(offset), timeout=10)
        res_json = response.json()

        # "data" can be absent or null when a page has no results.
        for page in res_json.get('data') or []:
            if 'article_url' in page and 'title' in page:
                get_detail(page['article_url'], page['title'])

        offset += page_size
if __name__ == '__main__':
    # Start fetching from the first page of results.
    get_url(0)


import re
import requests
import json,os
from urllib import request

def get_detail(url, title):
    """Download every image of a gallery article into ``download/<title>/``.

    The article page is fetched and searched for an embedded
    ``gallery: JSON.parse(...)`` expression. The decoded gallery's
    ``sub_images`` entries supply the image URLs, each of which is saved
    under its last URL path segment with ``.jpg`` appended.

    Args:
        url: Address of the article page.
        title: Article title; used (after sanitising) as the folder name.

    Returns:
        None. Pages that do not embed a gallery are skipped.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36'
    }
    # A timeout keeps one unresponsive server from hanging the whole crawl.
    response = requests.get(url, headers=headers, timeout=10)
    html = response.text

    match_res = re.search(r'gallery: JSON\.parse\((.+?)\)', html)
    if match_res is None:
        # The page has no embedded gallery, so there is nothing to download.
        return

    # Two decodes: the captured JSON.parse argument is itself a quoted string
    # whose content is the JSON document describing the gallery.
    result = json.loads(json.loads(match_res.group(1)))

    # Titles are free text; replace characters that are illegal or act as
    # separators in file-system paths, and avoid an empty folder name.
    safe_title = re.sub(r'[\\/:*?"<>|\r\n\t]+', '_', title).strip(' .') or 'untitled'
    target_dir = os.path.join('download', safe_title)
    os.makedirs(target_dir, exist_ok=True)

    for image_ in result.get('sub_images') or []:
        image_url = image_['url']
        fname = image_url.split('/')[-1]
        request.urlretrieve(image_url, os.path.join(target_dir, fname + '.jpg'))

    print(result)

def get_url(offset=0, last_page=4):
    """Walk the search-result pages and download each listed article.

    Starting at ``offset``, result pages of 20 entries are requested one
    after another. Every entry that carries both ``article_url`` and
    ``title`` is handed to ``get_detail``.

    Args:
        offset: Index of the first search result to request.
        last_page: Highest zero-based page index (``offset / 20``) that is
            still processed. The default of 4 keeps the original limit,
            i.e. offsets 0, 20, 40, 60 and 80.

    Returns:
        None.
    """
    page_size = 20
    url = 'https://www.toutiao.com/search_content/?offset={}&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab'

    # Integer arithmetic throughout: true division would turn the offset into
    # a float and put "offset=20.0" into the request URL.
    while offset <= last_page * page_size:
        response = requests.get(url.format(offset), timeout=10)
        res_json = response.json()

        # "data" can be absent or null when a page has no results.
        for page in res_json.get('data') or []:
            if 'article_url' in page and 'title' in page:
                get_detail(page['article_url'], page['title'])

        offset += page_size

if __name__ == '__main__':
    # Start fetching from the first page of results.
    get_url(0)

爬取今日头条

标签：get ber user path windows amp for tle win

原文地址：https://www.cnblogs.com/luwanhe/p/9490785.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行