
Scraping Chinadaily bilingual news



Today I needed to scrape some bilingual material on short notice. The data has not been cleaned yet, but I want to make full use of it.

The code below is meant to grab the link to every bilingual news article on Chinadaily. The first step was to study the URLs and page structure of the site: paging generally works by appending _2, _3, and so on to the front-page URL, as the small sketch after this paragraph illustrates. So this first script does nothing but collect the links.
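For example, the list-page URLs can be built like this (a small sketch of that naming convention; only the first page lacks the _N suffix):

# how the bilingual-news list pages are numbered
base = "http://language.chinadaily.com.cn/news_bilingual"
pages = [base + ".html"] + [base + "_%d.html" % n for n in range(2, 6)]
# -> news_bilingual.html, news_bilingual_2.html, ... news_bilingual_5.html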

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
File: bi_news.py
Author: ZhangHaiou(hozhangel@126.com)
Date: 2018/05/04
"""

import urllib
import re
import os

bi_urls = []
def getHtml(url):    # fetch the page and return its lines
    page = urllib.urlopen(url)
    html = page.readlines()
    #print html
    return html

def getImg(html):    # (unused below) download any .jpg images found in the page
    reg = r'src="(.+?\.jpg)" pic_ext'
    imgre = re.compile(reg)
    imglist = re.findall(imgre, html)
    x = 0
    for imgurl in imglist:
        urllib.urlretrieve(imgurl, '%s.jpg' % x)
        x += 1
    
def geturl(html):   # pick the article links we need out of the page
    for line in html:
        if re.search(r'<div class="mr10"><a href="\d\d\d\d-\d\d/\d\d/content_\d{4,}\.htm', line):
            # the lists run newest-first, so the first 2016 link means we are
            # past the material we want (2017 onwards): stop the whole script
            if re.search(r'<div class="mr10"><a href="2016-\d\d/\d\d/content_\d{4,}\.htm', line):
                os._exit(0)
            else:
                url = re.findall(r'\d\d\d\d-\d\d/\d\d/content_\d{4,}\.htm', line)
                print("http://language.chinadaily.com.cn/" + url[0])
                bi_urls.append("http://language.chinadaily.com.cn/" + url[0])

                
if __name__ == '__main__':
    n = 1
    # os.system('wget -r --spider http://language.chinadaily.com.cn/news_bilingual.html')
    # geturl(getHtml("http://language.chinadaily.com.cn/news_bilingual.html"))
    while n:    # page through the list until geturl() meets a 2016 link and exits
        if n < 2:
            html = getHtml("http://language.chinadaily.com.cn/news_bilingual.html")
        else:
            html = getHtml("http://language.chinadaily.com.cn/news_bilingual_" + str(n) + ".html")
        geturl(html)
        n = n + 1
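To see what the pattern in geturl() matches, here is a quick check against a hand-written list line (the href and the content id are made up for illustration):

import re

line = '<div class="mr10"><a href="2017-05/04/content_29193631.htm" target="_blank">'
print(re.findall(r'\d\d\d\d-\d\d/\d\d/content_\d{4,}\.htm', line))
# -> ['2017-05/04/content_29193631.htm']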

 

Run python bi_news.py > url.txt to save the collected links.

Contents of url.txt:

(screenshot: url.txt, one article link per line)

 

The next step is a simple crawl of the page content behind each link in url.txt, filing the news into folders by month; each file is named after the eight-digit number at the end of its news link (the short example below shows how both names fall out of a link).
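Before the full script, a quick look at that extraction (the content id in the URL below is hypothetical):

import re

line = "http://language.chinadaily.com.cn/2017-05/04/content_29193631.htm"
date = re.findall(r'\d\d\d\d-\d\d', line)[0]   # '2017-05'  -> month folder
name = re.findall(r'\d{6,}', line)[0]          # '29193631' -> file name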

 

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
File: content.py
Author: ZhangHaiou(hozhangel@126.com)
Date: 2018/05/04
"""

import urllib
import re
import os
import sys
bi_urls = []
def getHtml(url):    # fetch the whole page as a single string
    page = urllib.urlopen(url)
    html = page.read()
    #print html
    return html

def getImg(html):    # (unused below) download any .jpg images found in the page
    reg = r'src="(.+?\.jpg)" pic_ext'
    imgre = re.compile(reg)
    imglist = re.findall(imgre, html)
    x = 0
    for imgurl in imglist:
        urllib.urlretrieve(imgurl, '%s.jpg' % x)
        x += 1
    
def geturl(html):    # (unused below) same link filter as in bi_news.py
    for line in html:
        if re.search(r'<div class="mr10"><a href="\d\d\d\d-\d\d/\d\d/content_\d{4,}\.htm', line):
            if re.search(r'<div class="mr10"><a href="2016-\d\d/\d\d/content_\d{4,}\.htm', line):
                os._exit(0)
            else:
                url = re.findall(r'\d\d\d\d-\d\d/\d\d/content_\d{4,}\.htm', line)
                print(url)
                bi_urls.append(url)
def savefile(savepath, content):
    with open(savepath, "w") as fp:
        fp.write(content)
                
if __name__ == '__main__':

    for line in open(sys.argv[1], 'r'):
        line = line.strip()    # drop the trailing newline so the URL is usable
        content = ""
        n = 1
        while n:    # this loop makes sure multi-page articles are not missed
            if n > 1:
                # follow-up pages carry the "_N" suffix before .htm
                # (assuming the same convention as the list pages)
                htm = line.replace(".htm", "_" + str(n) + ".htm")
            else:
                htm = line
            raw = getHtml(htm)

            # a page without the usual DOCTYPE is blank/missing: stop paging
            if not re.findall(r'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">', raw):
                break
            print(htm)
            n = n + 1
            # for hang in raw:
            #     if re.search(r'^<p>.*</p>', hang):
            content = content + raw
        date = re.findall(r'\d\d\d\d-\d\d', line)[0]    # month folder, e.g. 2017-05
        filename = re.findall(r'\d{6,}', line)[0]       # trailing digits of the link
        if not os.path.exists(date):    # create the month folder if needed
            os.makedirs(date)
        savefile(date + "/" + filename + ".txt", content)
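
The saved files are still raw HTML. As a possible next step, here is a minimal cleaning sketch along the lines of the commented-out <p> filter above; the assumption that the text sits in <p>...</p> paragraphs, and the file path, are mine:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# clean.py -- minimal cleaning sketch, not part of the original scripts

import re

def extract_paragraphs(path):
    with open(path) as fp:
        raw = fp.read()
    paras = re.findall(r'<p>(.*?)</p>', raw, re.S)               # paragraph bodies
    paras = [re.sub(r'<[^>]+>', '', p).strip() for p in paras]   # strip inner tags
    return [p for p in paras if p]

if __name__ == '__main__':
    for p in extract_paragraphs("2017-05/29193631.txt"):   # hypothetical file
        print(p)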
        
      

 


Original post: https://www.cnblogs.com/hozhangel/p/8990818.html
