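# Shared setup assumed by every snippet below (not shown in the original post).
# The exact User-Agent value is an assumption -- any common desktop UA works;
# the target sites reject UA-less requests.
import os
import re
import requests
from lxml import etree

headers = {
    # assumed UA string; substitute your own browser's
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
}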
# Download the image data for every page number
if not os.path.exists('./imgLib'):
    os.mkdir('./imgLib')

# Define a generic url template for the numbered pages
url_model = 'http://sc.chinaz.com/tag_tupian/OuMeiMeiNv_%d.html'
for page in range(1, 5):
    print('Downloading data for page %d......' % page)
    if page == 1:
        # The first page does not follow the numbered template
        url = 'http://sc.chinaz.com/tag_tupian/OuMeiMeiNv.htm'
    else:
        url = url_model % page
    page_text = requests.get(url=url, headers=headers).text
    # Extract the image addresses with a regex
    re_ex = '<a target="_blank".*?<img src2="(.*?)" alt.*?</a>'
    # By default, . in a regex does not match newlines, so matching stops at a
    # line break. re.S (DOTALL) lets . match newlines too -- in crawling you
    # almost always need it, because the HTML you match spans multiple lines.
    img_src = re.findall(re_ex, page_text, re.S)
    for src in img_src:
        img_name = src.split('/')[-1]
        img_path = './imgLib/' + img_name
        img_data = requests.get(url=src, headers=headers).content
        with open(img_path, 'wb') as fp:
            fp.write(img_data)
        print(img_name, 'downloaded!')
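# A minimal, self-contained sketch of what re.S changes (hypothetical HTML):
# without re.S the lazy .*? cannot cross the line break, so nothing matches.
import re

html = '<a target="_blank">\n<img src2="http://example.com/1.jpg" alt="x">\n</a>'
print(re.findall('<a target="_blank".*?src2="(.*?)" alt', html))        # []
print(re.findall('<a target="_blank".*?src2="(.*?)" alt', html, re.S))  # ['http://example.com/1.jpg']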
from bs4 import BeautifulSoup
fp = open('./test.html', 'r', encoding='utf-8')
soup = BeautifulSoup(fp, 'lxml')
# print(soup)  # prints the page source loaded into the soup object
soup.title                            # the first title tag
soup.div                              # the first div tag
soup.find('div', class_='song')       # first div with class "song"
soup.find_all('div', class_='song')   # every div with class "song"
soup.select('.song')                  # CSS class selector
soup.select('#feng')                  # CSS id selector
soup.select('.tang > ul > li > a')    # > : descend one level at a time
soup.select('.tang a')                # space: any number of levels
soup.title.string                     # direct text of the tag only
soup.title.text                       # all text, including descendants
soup.find('div', class_='song').text
soup.find('a', id='feng')['href']     # take an attribute value
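# test.html is a local sample file whose contents the post does not show.
# A minimal hypothetical stand-in, inferred purely from the selectors above,
# that makes them all return something:
from bs4 import BeautifulSoup

html = '''
<html><head><title>demo</title></head><body>
  <div class="song">song text <a id="feng" href="http://example.com">feng</a></div>
  <div class="tang"><ul><li><a href="#">tang link</a></li></ul></div>
</body></html>'''
soup = BeautifulSoup(html, 'lxml')
print(soup.select('.tang > ul > li > a'))  # [<a href="#">tang link</a>]
print(soup.find('a', id='feng')['href'])   # http://example.com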
url = 'http://www.shicimingju.com/book/sanguoyanyi.html'
page_text = requests.get(url=url, headers=headers).text
# Parse the data
soup = BeautifulSoup(page_text, 'lxml')  # this soup object can only parse the home page
a_list = soup.select('.book-mulu > ul > li > a')
fp = open('./sanguo.txt', 'w', encoding='utf-8')
for a in a_list:
    title = a.string
    detail_url = 'http://www.shicimingju.com' + a['href']
    detail_page_text = requests.get(url=detail_url, headers=headers).text
    # Parse the chapter content out of the detail page
    detail_soup = BeautifulSoup(detail_page_text, 'lxml')
    div_tag = detail_soup.find('div', class_='chapter_content')
    content = div_tag.text
    fp.write(title + ':' + content + '\n')
    print(title, 'downloaded!!!')
fp.close()
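# The same loop written with a context manager, so the output file is closed
# even if a request raises mid-loop; reuses a_list, headers, requests and
# BeautifulSoup from the snippet above:
with open('./sanguo.txt', 'w', encoding='utf-8') as fp:
    for a in a_list:
        detail_url = 'http://www.shicimingju.com' + a['href']
        detail_soup = BeautifulSoup(requests.get(url=detail_url, headers=headers).text, 'lxml')
        fp.write(a.string + ':' + detail_soup.find('div', class_='chapter_content').text + '\n')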
# Scrape the joke authors and contents from qiushibaike
url_model = 'https://www.qiushibaike.com/text/page/%d/'
for page in range(1, 4):
    url = url_model % page
    page_text = requests.get(url=url, headers=headers).text
    tree = etree.HTML(page_text)
    # Each div tag in this list contains one item we want to parse
    # An xpath called on the tree object is a global parse
    div_list = tree.xpath('//div[@class="col1 old-style-col1"]/div')
    for div in div_list:
        # div is an Element object: one specific div tag on the page,
        # i.e. a local chunk of the html source
        # In a local parse, ./ stands for the tag the xpath caller represents
        author = div.xpath('./div[1]/a[2]/h2/text()')[0]  # local parse: can only locate tags inside this div
        content = div.xpath('./a[1]/div/span//text()')
        content = ''.join(content)
        print(author, content)
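# A minimal sketch of local (./) vs. global (//) xpath on an Element,
# using a hypothetical two-item document:
from lxml import etree

tree = etree.HTML('<div class="item"><span>a</span></div><div class="item"><span>b</span></div>')
first = tree.xpath('//div[@class="item"]')[0]
print(first.xpath('./span/text()'))  # ['a'] -- relative to this div only
print(first.xpath('//span/text()'))  # ['a', 'b'] -- // always searches the whole document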
# Scraping image data with xpath
import os
from urllib import request

dirName = 'imgLibs'
if not os.path.exists(dirName):
    os.mkdir(dirName)

url = 'http://pic.netbian.com/4kmeinv/'
response = requests.get(url, headers=headers)
response.encoding = 'gbk'  # the site is gbk-encoded; set this before reading .text
page_text = response.text
tree = etree.HTML(page_text)
# Parse out the image name + image link
li_list = tree.xpath('//*[@id="main"]/div[3]/ul/li')
for li in li_list:  # local parse, so every expression must start with ./
    img_name = li.xpath('./a/img/@alt')[0] + '.jpg'
    img_src = 'http://pic.netbian.com' + li.xpath('./a/img/@src')[0]
    img_path = dirName + '/' + img_name  # where the image is stored
    request.urlretrieve(img_src, img_path)
    print(img_name, 'downloaded!!!')
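# Note: urllib.request.urlretrieve sends no custom headers (its default
# User-Agent is Python-urllib), so a site that checks the UA may block it.
# A drop-in replacement for the urlretrieve line using requests and the
# same img_src/img_path/headers as above:
img_data = requests.get(url=img_src, headers=headers).content
with open(img_path, 'wb') as f:
    f.write(img_data)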
# Huawei vmall offline-store API: POST the filter as a JSON body,
# then GET each store's detail by id
main_url = 'https://openapi.vmall.com/mcp/offlineshop/getShopList'
data = {"portal": 2, "lang": "zh-CN", "country": "CN", "brand": 1, "province": "河北", "city": "邯郸", "pageNo": 1, "pageSize": 40}
main_json_data = requests.post(url=main_url, headers=headers, json=data).json()  # json= sends a JSON request body
for dic in main_json_data['shopInfos']:
    id_ = dic['id']
    url = 'https://openapi.vmall.com/mcp/offlineshop/getShopById'
    params = {
        'portal': '2',
        'version': '10',
        'country': 'CN',
        'shopId': id_,
        'lang': 'zh-CN',
    }
    json_data = requests.get(url=url, headers=headers, params=params).json()  # params= becomes the query string
    address = json_data['shopInfo']['address']
    time_ = json_data['shopInfo']['serviceTime']
    print(address, time_)
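# For reference, how requests serializes the argument styles used in this
# post (httpbin.org just echoes back what it receives; illustrative only):
import requests

requests.post('https://httpbin.org/post', json={'a': 1})   # JSON body {"a": 1}, Content-Type: application/json
requests.post('https://httpbin.org/post', data={'a': 1})   # form body a=1, Content-Type: application/x-www-form-urlencoded
requests.get('https://httpbin.org/get', params={'a': 1})   # query string: https://httpbin.org/get?a=1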
# Scrape the city names
url = 'https://www.aqistudy.cn/historydata/'
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
# | unions two xpath expressions: hot cities and all cities in one call
all_cities = tree.xpath('//div[@class="bottom"]/ul/div[2]/li/a/text() | //div[@class="bottom"]/ul/li/a/text()')
all_cities