码迷,mamicode.com
首页 > 数据库 > 详细

python 爬虫之requests+日志+配置文件读取+mysql入库

时间:2018-10-12 14:04:15      阅读:449      评论:0      收藏:0      [点我收藏+]

标签:类型   需要   any   argv   ror   efault   爬虫   datetime   网站   


#!/usr/bin/env python
# -*- coding: utf-8 -*-
# log_config.py -- logging setup shared by the crawler script below.
import logging
import sys

# Python 2 only: switch the interpreter-wide default string encoding to UTF-8.
# ``reload`` is needed because ``site`` removes ``setdefaultencoding`` at
# start-up. Neither name exists on Python 3, hence the version guard.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding('utf-8')


def getlogger(logName, logFile):
    """Return a DEBUG-level logger that writes to both the console and a file.

    Args:
        logName: Name passed to ``logging.getLogger``; it also appears in
            every formatted record.
        logFile: Path of the log file, opened in append mode.

    Returns:
        The configured ``logging.Logger``.  Calling this again with the same
        ``logName`` returns the already-configured logger unchanged (the
        ``logFile`` of the later call is then ignored), so records are never
        emitted more than once per handler.
    """
    logger = logging.getLogger(logName)
    logger.setLevel(logging.DEBUG)

    # ``logging.getLogger`` hands back the same object for the same name;
    # attaching handlers again would duplicate every log line.
    if logger.handlers:
        return logger

    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    fileHandle = logging.FileHandler(logFile, 'a')
    fileHandle.setLevel(logging.DEBUG)
    fileHandle.setFormatter(formatter)

    screenHandle = logging.StreamHandler()
    screenHandle.setLevel(logging.DEBUG)
    screenHandle.setFormatter(formatter)

    logger.addHandler(fileHandle)
    logger.addHandler(screenHandle)
    return logger

mysql.conf 

[mysql]
user=你的root
password=你的password
database=你的database
host=localhost
port =3306

requests_to_mysql.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import ConfigParser
import json
import random
import sys
import time
import pymysql
import requests
import log_config
import datetime

# Logger writing to the console and to ``reference_mysql.log``.
logger = log_config.getlogger('reference_mysql', 'reference_mysql.log')

# MySQL connection settings are read from the ``[mysql]`` section of
# ``mysql.conf`` in the current working directory.
conf = ConfigParser.ConfigParser()
conf.read("mysql.conf")
user = conf.get("mysql", "user")
password = conf.get("mysql", "password")
database = conf.get("mysql", "database")
host = conf.get("mysql", "host")
# Kept as a string here; converted with ``int(port)`` where the connection
# is opened.
port = conf.get("mysql", "port")

# Placeholder: the URL the crawl requests are POSTed to.
siteURL = '你要爬取得请求'
# Placeholder: domain prefix prepended to each relative attachment path.
fileurl = '可能爬取路径需要拼接的域名'

# Request headers sent with every POST ('Host' is a placeholder for the
# domain of the crawled site).
headers = {'Host': '爬取网站的域名',
           'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/53.0.2785.104 Safari/537.36 Core/1.53.3103.400 QQBrowser/9.6.11372.400'}

# The crawled site may filter on several layers of conditions, so each one
# has to be enumerated here; these usually include a date range as well.
# All key/value pairs below are placeholders to be filled in.
cate_dict = {'key': 'value'}

moudue_dict = {'key': 'value'}

industry_dict = {'key': 'value'}

# Dates iterated over when the script is started with the ``all`` argument.
date_list = ['2018-10-10']

# By default only today's data is crawled.
date = time.strftime('%Y-%m-%d', time.localtime(time.time()))
logger.info("start get %s data" % date)
# A start-up argument decides between crawling today's data and crawling the
# whole history.  ``sys.argv`` is a list whose first element is the script
# path, so a length of 1 means "no arguments given".
if len(sys.argv) != 1:
    if sys.argv[1] == 'all':
        # An empty date tells collect_cate() to walk ``date_list`` instead.
        date = ''
    else:
        logger.info('input error,please input all')
        # ``sys.exit`` instead of the interactive ``exit`` helper (which the
        # ``site`` module may not install); non-zero status signals the
        # invalid argument to the caller.
        sys.exit(1)


# 获取总页数
def get_page(dates, category, mod, industry):
    """Return how many result pages exist for one filter combination.

    POSTs a first-page query to ``siteURL`` and derives the page count from
    the ``totalAnnouncement`` field of the JSON response, at 30 records per
    page.

    Args:
        dates: Value sent as the ``seDate`` form field.
        category: Key into ``cate_dict``; the mapped value is sent as
            ``category``.
        mod: Value sent as the ``plate`` form field.
        industry: Value sent as the ``trade`` form field.

    Returns:
        The number of pages as an ``int``; ``0`` when there are no records.

    Raises:
        Whatever ``requests.post`` / ``json.loads`` raise on network or
        parse failure, and ``KeyError`` if the response lacks
        ``totalAnnouncement`` -- this function does not retry.
    """
    page_size = 30
    data = {'seDate': dates,
            'pageNum': 1,
            'pageSize': page_size,
            'category': cate_dict[category],
            'column': 'szse',
            'plate': mod,
            'tabName': 'fulltext',
            'trade': industry}
    req = requests.post(siteURL, headers=headers, data=data)
    content = json.loads(req.text)
    filesum = content['totalAnnouncement']
    # Integer ceiling division: stays an int on both Python 2 and Python 3
    # (plain ``/`` would yield a float on Python 3 and break ``range``).
    return (filesum + page_size - 1) // page_size


# 获取一页数据
def get_page_data(dates, category, page, module_type, industry):
    """Fetch one page of announcements and bulk-insert its rows into MySQL.

    The request is attempted up to three times with a random 1-2 second
    pause between attempts.  Any failure while parsing the response or
    storing the rows is logged and swallowed so the overall crawl continues
    with the next page.

    Args:
        dates: Value sent as the ``seDate`` form field.
        category: Key into ``cate_dict``; also stored in each row.
        page: 1-based page number to request.
        module_type: Value sent as the ``plate`` form field; also stored.
        industry: Value sent as ``trade``; also used as key into
            ``industry_dict`` for the stored row.

    Returns:
        None.
    """
    # The timestamp is pre-formatted as a string; per the original author's
    # note, the MySQL datetime column does not accept it otherwise.
    now_date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    data = {'seDate': dates,
            'pageNum': page,
            'pageSize': 30,
            'category': cate_dict[category],
            'column': 'szse',
            'plate': module_type,
            'tabName': 'fulltext',
            'trade': industry}
    logger.info("getting page %s" % str(page))
    retries = 0
    content = ""
    while retries < 3:
        try:
            req = requests.post(siteURL, headers=headers, data=data)
            content = req.text
            break
        except Exception as e:
            # The exception needs a ``%s`` placeholder; passing it as a bare
            # extra argument makes the logging module fail while formatting.
            logger.error("get data failed: %s", e)
            retries += 1
            logger.info('req error retry %s ' % retries)
            time.sleep(random.uniform(1, 2))
    if not content:
        # Every attempt failed (or the body was empty): there is nothing to
        # parse, so skip this page instead of failing inside json.loads.
        logger.error("no response body for page %s, skipping", page)
        return
    try:
        content = json.loads(content)
        filelist = content['announcements']
        logger.info("filelist=%s" % len(filelist))
        page_datas = []
        for fileone in filelist:
            # Processing status of the file as tracked in MySQL.
            pro_status = 0
            # Retry counter used by the downstream Java URL parser; it is
            # irrelevant here and simply initialised to 0.
            retry_count = 0
            sec_code = fileone['secCode']
            sec_name = fileone['secName']
            announcement_title = fileone['announcementTitle']
            announcement_time = fileone['announcementTime']
            public_time = date_long_to_str(announcement_time)
            adjunct_url = fileurl + fileone['adjunctUrl']
            # 14 values, matching the 14 placeholders of the INSERT statement.
            page_data = [category, cate_dict[category], industry_dict[industry], module_type, public_time, public_time,
                         sec_code, sec_name, announcement_title, adjunct_url, pro_status, retry_count,
                         now_date, now_date]
            page_datas.append(page_data)
        if len(page_datas) > 0:
            set_data_mysql(page_datas)

    except Exception as e:
        # %-style arguments instead of ``+`` concatenation so that non-string
        # values cannot raise a TypeError inside the error handler itself.
        logger.error(
            'get this page detail error... [cat:%s industry:%s module_type:%s date:%s]: %s',
            category, industry, module_type, dates, e)


# 批量插入mysql
def set_data_mysql(page_datas):
    """Bulk-insert rows into MySQL using one connection per call.

    Args:
        page_datas: Sequence of rows; each row is a sequence of 14 values
            matching the 14 ``%s`` placeholders of the INSERT statement.

    Raises:
        Re-raises any exception from the insert or commit after rolling the
        transaction back.  The cursor and connection are closed in every
        case.
    """
    # NOTE(review): no ``charset`` is passed, so the driver default applies;
    # confirm it can encode the non-ASCII text being stored.
    conn = pymysql.connect(host=host, port=int(port), user=user, passwd=password, db=database)
    try:
        cursor = conn.cursor()
        try:
            sql = "INSERT INTO test(这里有14个字段) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
            effect_row = cursor.executemany(sql, page_datas)
            # Nothing reaches the database until the transaction is committed.
            conn.commit()
        finally:
            cursor.close()
    except Exception:
        # Do not leave a half-applied batch behind.
        conn.rollback()
        raise
    finally:
        conn.close()
    logger.info("already into dabatabase %s" % effect_row)


# long转str类型时间1539187200000  1539001526000->2018-10-08 20:25:26
def date_long_to_str(long_date):
    """Convert an epoch timestamp in milliseconds to a local-time string.

    Example (result depends on the local time zone):
    ``1539001526000`` -> ``"2018-10-08 20:25:26"``.

    Args:
        long_date: Milliseconds since the epoch as an int or a numeric
            string.  ``None``, ``""`` and ``0`` mean "no timestamp".

    Returns:
        The time formatted as ``"%Y-%m-%d %H:%M:%S"``; the current time is
        used when no timestamp was supplied.
    """
    if not long_date:
        return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # ``int`` works on Python 2 (auto-promoting to long) and Python 3, unlike
    # ``long``; ``//`` keeps the millisecond -> second conversion integral.
    seconds = int(long_date) // 1000
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(seconds))


# 全局循环爬取
def collect_cate():
    """Crawl every module / category / industry combination.

    When the module-level ``date`` is the empty string (script started with
    the ``all`` argument) each entry of ``date_list`` is crawled; otherwise
    only ``date`` itself is.  For every combination the page count is
    looked up with ``get_page`` and each page is then fetched and stored
    with ``get_page_data``.
    """
    # Both modes run the same loop nest; they differ only in the dates used.
    dates = date_list if date == '' else [date]
    for seDate in dates:
        for mod in moudue_dict:
            for category in cate_dict:
                for industry in industry_dict:
                    pages = get_page(seDate, category, moudue_dict[mod], industry)
                    # ``int`` guards ``range`` against a float page count; a
                    # count of 0 simply yields an empty range.
                    for page in range(1, int(pages) + 1):
                        get_page_data(seDate, category, page, moudue_dict[mod], industry)


# Script entry point: call collect_cate() only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    collect_cate()

 

python 爬虫之requests+日志+配置文件读取+mysql入库

标签:类型   需要   any   argv   ror   efault   爬虫   datetime   网站   

原文地址:https://www.cnblogs.com/keepMoveForevery/p/9777155.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!