码迷,mamicode.com
首页 > 编程语言 > 详细

python 百度图片爬虫

时间:2018-05-22 22:27:19      阅读:377      评论:0      收藏:0      [点我收藏+]

标签:百度   exist   requests   download   down   .net   rm -rf   format   os.path   

 

# -*- coding:utf-8 -*-
#https://blog.csdn.net/qq_32166627/article/details/60882964
import requests
import os
import pinyin

def getManyPages(keyword, pages):
    """Query the Baidu image-search JSON endpoint for several result pages.

    One GET request is sent per page to
    ``https://image.baidu.com/search/acjson``; each request asks for 30
    results (``rn``) starting at offset ``pn``.

    Args:
        keyword: Search term, sent as both ``queryWord`` and ``word``.
        pages: Number of pages to request. Values <= 0 send no request.

    Returns:
        A list with one entry per requested page. Each entry is the value
        stored under the ``data`` key of the JSON response, or ``None``
        when the response has no such key.
    """
    url = 'https://image.baidu.com/search/acjson'

    params = []
    # NOTE: offsets run 30, 60, ..., 30 * pages, so offset 0 is never requested.
    for offset in range(30, 30 * pages + 30, 30):
        params.append({
            'tn': 'resultjson_com',
            'ipn': 'rj',
            'ct': 201326592,
            'is': '',
            'fp': 'result',
            'queryWord': keyword,
            'cl': 2,
            'lm': -1,
            'ie': 'utf-8',
            'oe': 'utf-8',
            'adpicid': '',
            'st': -1,
            'z': '',
            'ic': 0,
            'word': keyword,
            's': '',
            'se': '',
            'tab': '',
            'width': '',
            'height': '',
            'face': 0,
            'istype': 2,
            'qc': '',
            'nc': 1,
            'fr': '',
            'pn': offset,
            'rn': 30,
            'gsm': '1e',
            '1488942260214': '',
        })

    urls = []
    for query in params:
        urls.append(requests.get(url, params=query).json().get('data'))

    return urls


def getImg(dataList, localPath, keyword):
    """Download every thumbnail referenced in ``dataList`` into ``localPath``.

    Files are written as ``<localPath>/<keyword>_<n>.jpg`` where ``n`` is a
    running counter over successfully located thumbnails, starting at 0.

    Args:
        dataList: Iterable of pages as returned by ``getManyPages``; each
            page is an iterable of dicts, or ``None``/empty when the
            response carried no data.
        localPath: Directory to save into; created if it does not exist.
        keyword: File-name prefix for the saved images.
    """
    if not os.path.exists(localPath):  # create the target folder
        os.mkdir(localPath)

    x = 0
    for page in dataList:
        # getManyPages stores None for a response without a 'data' key.
        if not page:
            continue
        for item in page:
            thumb_url = item.get('thumbURL')
            if thumb_url is not None:
                print("down " + keyword + str(x) + " image " + thumb_url)
                ir = requests.get(thumb_url)
                # Context manager guarantees the image file is flushed and closed.
                with open(localPath + "/" + keyword + '_%d.jpg' % x, 'wb') as img_file:
                    img_file.write(ir.content)
                x += 1
            else:
                print('image not exist')

# if __name__ == ‘__main__‘:

#     with open("stars_list_clean.txt",‘r‘) as face_file:
#       stars_list = face_file.readlines()
#       index = 0
#       for line in stars_list:
#           line = line.replace(‘\r‘,‘‘).replace(‘\n‘,‘‘).replace(‘\t‘,‘‘)
#           keyword_english = pinyin.get(line, format="strip")
#           keyword = line
#           index += 1
#           if index > 0:
#             break

#     # print(keyword)
#     # keyword1 = ‘胡因梦‘
#     # if keyword == keyword1:
#     #     print("yes")
#     # else:
#     #     print("no")
#     #keyword = ‘胡因梦‘
#     #keyword = keyword.replace(‘\X‘,‘‘)
#     dataList = getManyPages(keyword,2)  # 参数1:关键字,参数2:要下载的页数
#     getImg(dataList,‘./hanxue‘, keyword_english) # 参数2:指定保存的路径

    # keyword = ‘韩雪‘
    # dataList = getManyPages(keyword,10)  # 参数1:关键字,参数2:要下载的页数
    # getImg(dataList,‘./hanxue‘) # 参数2:指定保存的路径

if __name__ == '__main__':
    # Script entry point (Python 3).
    #
    # 1. Read one search keyword per line from "stars_list_clean.txt".
    # 2. Write the tone-less pinyin form of every keyword to
    #    "stars_list_en.txt", one per line.
    # 3. For each keyword, download `pages` pages of Baidu thumbnails into
    #    "./stars_srcimg/<index>_<pinyin>/".

    with open("stars_list_clean.txt", 'r') as face_file:
        stars_list = face_file.readlines()

    # The context manager closes (and therefore flushes) the pinyin list file.
    with open("stars_list_en.txt", 'w') as fp:
        for line in stars_list:
            line = line.replace('\r', '').replace('\n', '').replace('\t', '')
            keyword_english = pinyin.get(line, format="strip")
            fp.write('%s\n' % keyword_english)

    save_root = "./stars_srcimg/"
    if not os.path.exists(save_root):
        os.mkdir(save_root)

    pages = 2
    maxnum = pages * 30
    print(maxnum)

    # The index counts every input line, including those skipped below, so a
    # keyword keeps the same folder number across reruns.
    for face_ID_index, line in enumerate(stars_list):
        keyword = line.replace('\r', '').replace('\n', '').replace('\t', '')
        print(keyword)
        keyword_english = pinyin.get(keyword, format="strip")
        print(keyword_english)
        facesavepath = save_root + str(face_ID_index) + "_" + keyword_english
        print(facesavepath)

        # An existing folder means this keyword was already handled; skip it.
        if os.path.exists(facesavepath):
            print(keyword, " exist")
            continue
        os.mkdir(facesavepath)

        dataList = getManyPages(keyword, pages)  # arg 1: keyword, arg 2: number of pages to download
        getImg(dataList, facesavepath, keyword_english)  # arg 2: directory to save into

 

python 百度图片爬虫

标签:百度   exist   requests   download   down   .net   rm -rf   format   os.path   

原文地址:https://www.cnblogs.com/adong7639/p/9074012.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!