码迷,mamicode.com
首页 > 其他好文 > 详细

爬取今日头条

时间:2018-08-17 00:40:09      阅读:233      评论:0      收藏:0      [点我收藏+]

标签:get   ber   user   path   windows   amp   for   tle   win   

import re
import requests
import json,os
from urllib import request

def get_detail(url, title):
    """Download every image of one gallery article.

    Fetches the page at ``url``, extracts the payload passed to
    ``gallery: JSON.parse(...)`` in the page source, and saves each entry of
    its ``sub_images`` list into ``download/<title>/``.

    Args:
        url: Address of the article page.
        title: Article title; used (after sanitising) as the name of the
            download sub-directory.

    Returns:
        None. Pages that contain no gallery payload are skipped.
    """
    # NOTE(review): "Windows NTr 6.1" looks like a typo for "Windows NT 6.1";
    # left unchanged here -- confirm before altering the header.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NTr 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    s = response.text

    match_res = re.search(r'gallery: JSON.parse\((.+?)\)', s)
    if match_res is None:
        # No gallery payload on this page, so there is nothing to download.
        return

    # The argument of JSON.parse is a JSON string literal whose content is
    # itself JSON, hence the two decoding passes.
    result = json.loads(json.loads(match_res.group(1)))
    print(type(result))

    # Titles are free text; replace characters that are path separators or
    # otherwise reserved in file names so the directory can be created.
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', title).strip()
    target_dir = os.path.join('download', safe_title)
    os.makedirs(target_dir, exist_ok=True)

    for image_ in result['sub_images']:
        image_url = image_['url']
        # The last URL segment carries no extension, so ".jpg" is appended.
        fname = image_url.split('/')[-1]
        request.urlretrieve(image_url, os.path.join(target_dir, fname + '.jpg'))

    print(result)

def get_url(offset=0):
    """Walk the search-result pages and download each listed article.

    Starting at ``offset``, queries the search endpoint 20 results at a time
    and hands every entry that has an ``article_url`` to ``get_detail``.
    Pages are processed while ``offset <= 80`` (page index 0 through 4 when
    starting from 0).

    Args:
        offset: Result offset of the first page to fetch.

    Returns:
        None.
    """
    url = 'https://www.toutiao.com/search_content/?offset={}&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab'
    page_size = 20
    last_page = 4

    # Plain integer arithmetic keeps the offset an int, so the URL carries
    # "offset=20" and not "offset=20.0". Checking the bound before fetching
    # also avoids requesting a page whose results would be discarded.
    while offset <= page_size * last_page:
        response = requests.get(url.format(offset))
        res_json = response.json()

        # "data" may be absent or null; treat that as an empty page.
        for page in res_json.get('data') or []:
            if 'article_url' in page:
                get_detail(page['article_url'], page['title'])

        offset += page_size

if __name__ == '__main__':
    # Start fetching from the first page of results.
    get_url(0)


import re
import requests
import json,os
from urllib import request

def get_detail(url, title):
    """Download every image of one gallery article.

    Fetches the page at ``url``, extracts the payload passed to
    ``gallery: JSON.parse(...)`` in the page source, and saves each entry of
    its ``sub_images`` list into ``download/<title>/``.

    Args:
        url: Address of the article page.
        title: Article title; used (after sanitising) as the name of the
            download sub-directory.

    Returns:
        None. Pages that contain no gallery payload are skipped.
    """
    # NOTE(review): "Windows NTr 6.1" looks like a typo for "Windows NT 6.1";
    # left unchanged here -- confirm before altering the header.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NTr 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    s = response.text

    match_res = re.search(r'gallery: JSON.parse\((.+?)\)', s)
    if match_res is None:
        # No gallery payload on this page, so there is nothing to download.
        return

    # The argument of JSON.parse is a JSON string literal whose content is
    # itself JSON, hence the two decoding passes.
    result = json.loads(json.loads(match_res.group(1)))
    print(type(result))

    # Titles are free text; replace characters that are path separators or
    # otherwise reserved in file names so the directory can be created.
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', title).strip()
    target_dir = os.path.join('download', safe_title)
    os.makedirs(target_dir, exist_ok=True)

    for image_ in result['sub_images']:
        image_url = image_['url']
        # The last URL segment carries no extension, so ".jpg" is appended.
        fname = image_url.split('/')[-1]
        request.urlretrieve(image_url, os.path.join(target_dir, fname + '.jpg'))

    print(result)

def get_url(offset=0):
    """Walk the search-result pages and download each listed article.

    Starting at ``offset``, queries the search endpoint 20 results at a time
    and hands every entry that has an ``article_url`` to ``get_detail``.
    Pages are processed while ``offset <= 80`` (page index 0 through 4 when
    starting from 0).

    Args:
        offset: Result offset of the first page to fetch.

    Returns:
        None.
    """
    url = 'https://www.toutiao.com/search_content/?offset={}&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab'
    page_size = 20
    last_page = 4

    # Plain integer arithmetic keeps the offset an int, so the URL carries
    # "offset=20" and not "offset=20.0". Checking the bound before fetching
    # also avoids requesting a page whose results would be discarded.
    while offset <= page_size * last_page:
        response = requests.get(url.format(offset))
        res_json = response.json()

        # "data" may be absent or null; treat that as an empty page.
        for page in res_json.get('data') or []:
            if 'article_url' in page:
                get_detail(page['article_url'], page['title'])

        offset += page_size

if __name__ == '__main__':
    # Start fetching from the first page of results.
    get_url(0)

爬取今日头条

标签:get   ber   user   path   windows   amp   for   tle   win   

原文地址:https://www.cnblogs.com/luwanhe/p/9490785.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!