
First Steps with Web Crawlers (Scraping the dytt Movie List and Download Links)

Posted: 2018-10-12 21:21:10


import re
from urllib.request import urlopen


def getPage(url):
    # Fetch a page; dytt8.net serves GBK-encoded HTML
    response = urlopen(url)
    return response.read().decode("gbk", errors="ignore")


def parsePage(s):
    # Extract the detail-page link from the list page
    com = re.compile(r'<td height="26">.*?<b>.*?<a href="(?P<url_name>.*?)" class="ulink">', re.S)
    ret = com.finditer(s)
    for i in ret:
        # return on the first iteration: only the first movie on each list page is followed
        return "http://www.dytt8.net" + i.group("url_name")


def parsePage1(s):
    # Extract translated title, original title, director, cast, and download link
    # from the detail page
    com = re.compile(r'<div id="Zoom">.*?译.*?名(?P<name>.*?)<br />'
                     r'◎片.*?名(?P<pianname>.*?)<br />.*?◎导.*?演(?P<daoyan>.*?)<br />'
                     r'◎主.*?演(?P<zhuyan>.*?)<br /><br />◎简.*?介.*?'
                     r'<td.*?><a href="(?P<xiazaidizhi>.*?)">', re.S)
    ret1 = com.finditer(s)
    for i in ret1:
        # Strip full-width space (\u3000) padding from each field
        yield {"yiming": re.sub("[\u3000]", "", i.group("name")),
               "pianming": re.sub("[\u3000]", "", i.group("pianname")),
               "daoyan": re.sub("[\u3000]", "", i.group("daoyan")),
               "zhuyan": re.sub("[\u3000]", "", i.group("zhuyan")),
               "xiazaidizhi": re.sub("[\u3000]", "", i.group("xiazaidizhi"))}


def main(num):
    url = "http://www.dytt8.net/html/gndy/dyzz/list_23_%s.html" % num
    response_html = getPage(url)
    xiangqing = parsePage(response_html)
    response1_html = getPage(xiangqing)
    ret = parsePage1(response1_html)
    # Use a context manager so the file is closed after each page
    with open("move_list", "a", encoding="utf8") as f:
        for obj in ret:
            print(obj)
            f.write(str(obj) + "\n")


for i in range(1, 181):
    main(i)
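The heart of the script above is `re.finditer` with named groups: each `(?P<name>.*?)` captures one field, which is then read back with `match.group("name")`. A minimal sketch of the list-page extraction, using a hypothetical HTML snippet in place of a live request:

```python
import re

# Hypothetical fragment of a dytt list page (not fetched from the network)
html = '<td height="26"><b><a href="/html/gndy/dyzz/1.html" class="ulink">Movie</a></b></td>'

# The same named-group pattern idea used in parsePage above
pattern = re.compile(r'<a href="(?P<url_name>.*?)" class="ulink">', re.S)
match = pattern.search(html)

# Relative links on the site are joined onto the base URL
link = "http://www.dytt8.net" + match.group("url_name")
print(link)  # http://www.dytt8.net/html/gndy/dyzz/1.html
```

The non-greedy `.*?` matters here: with a greedy `.*` and `re.S`, the group could swallow everything up to the last `" class="ulink"` on the page instead of stopping at the first one.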

 



Original source: https://www.cnblogs.com/zhoushibin-1/p/9780285.html
