码迷,mamicode.com
首页 > 数据库 > 详细

Python3.5爬取cbooo.cn数据并且同步到mysql中

时间:2017-11-01 16:33:34      阅读:306      评论:0      收藏:0      [点我收藏+]

标签:add   auto   try   code   str   def   ror   address   hang   

 

#!/usr/local/bin/python
# -*- coding: utf-8 -*-
# Python:                  3.5
# Author:                  wucl(),zhenghai.zhang
# Program:                 爬取CBO网站上所有电影的名称并写入数据库。
# Version:                 0.1
# History:                 2017.10.25

import requests,time, pymysql, re, datetime
from exchangelib import DELEGATE, Account, Credentials, Message, Mailbox, HTMLBody

# Connection and routing settings used by the functions below.
# The "xxx" values are redacted placeholders and must be replaced with real
# values before the script is run.
host = "xxx"
user = "xxx"
passwd = "xxx"
dbme = "crawl"                         # database the script connects to
dbtarget = "back_brace"                # database that holds the sync table
table = "movie_hotwords"               # receives every crawled movie
tabledelta = "movie_hotwords_delta"    # receives only the movies new in this run
tablesync = "slot_value"               # sync table inside dbtarget
port = 3306
tolist = ["xxx@xxx.com"]               # recipients of the summary e-mail

def get_info():
    """Fetch the paging summary of the cbooo.cn movie listing.

    Requests page 1 of the listing and reads the ``tPage`` and ``tCount``
    fields of the JSON response.

    Returns:
        A ``(tPage, tCount)`` tuple on success. On any failure (request
        error, non-JSON body, missing field) a message is printed and
        ``None`` is returned, so callers must check before unpacking.
    """
    url = ("http://www.cbooo.cn/Mdata/getMdata_movie"
           "?area=50&type=0&year=0&initial=%E5%85%A8%E9%83%A8&pIndex=1")
    try:
        pData = requests.get(url).json()
        return pData["tPage"], pData["tCount"]
    except Exception:
        # Best-effort by design, but no longer a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        print("获取总页数和总电影数失败")
        return None


def get_movies(page):
    """Fetch one page of the cbooo.cn movie listing.

    Args:
        page: Page index appended to the listing URL as ``pIndex``.

    Returns:
        The value of the ``pData`` field of the JSON response. On any
        failure a message is printed and ``None`` is returned.
    """
    url = ("http://www.cbooo.cn/Mdata/getMdata_movie"
           "?area=50&type=0&year=0&initial=%E5%85%A8%E9%83%A8&pIndex=" + str(page))
    try:
        pData = requests.get(url).json()
        return pData["pData"]
    except Exception:
        # Best-effort by design, but no longer a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        print("获取第%s页电影列表失败" % page)
        return None


# Punctuation stripped from movie names before they are stored.
# The listing this script was recovered from shows the set in ASCII form,
# unquoted after the third character, and with several characters replaced by
# "?" placeholders; those characters could not be recovered.
# NOTE(review): the full-width variants of the ASCII marks are included on the
# assumption that the original set targeted full-width text -- confirm against
# the original source.
_ASCII_PUNCT = "!?\"#$%&'()*+,-./:;<=>@[\\]^_`{|}~"
_FULLWIDTH_PUNCT = "".join(chr(ord(ch) + 0xFEE0) for ch in _ASCII_PUNCT)
_CJK_PUNCT = "、〃》「」『』【】〔〕〖〗〝〞–—‘’“”…﹏。·"
# re.escape keeps "-", "]", "^" and "\" literal inside the character class.
_PUNCT_RE = re.compile(
    "[%s]+" % re.escape(_ASCII_PUNCT + _FULLWIDTH_PUNCT + _CJK_PUNCT))


def Movie_insert(host, user, passwd, dbme, port, table, movies_list):
    """Insert crawled movies into ``table`` and return the ones that were new.

    Each movie's ``MovieName`` is stripped of punctuation **in place** (the
    dicts in ``movies_list`` are modified), then inserted as
    ``(movie_id, movie_name)``. Rows the database rejects are reported and
    skipped.

    Args:
        host, user, passwd, dbme, port: MySQL connection settings.
        table: Name of the target table; interpolated into the SQL text, so
            it must come from trusted configuration.
        movies_list: Iterable of dicts with ``ID`` and ``MovieName`` keys.
            ``None`` (what ``get_movies`` returns on failure) is treated as
            empty.

    Returns:
        List of the movie dicts whose insert succeeded.
    """
    new_movies = []
    # Only the table name is formatted in; the values are bound by the driver
    # so scraped names cannot break or inject into the statement.
    sql = "insert into %s(movie_id, movie_name) values(%%s, %%s)" % table
    conn = pymysql.connect(host=host, user=user, passwd=passwd, db=dbme,
                           port=port, charset="utf8")
    try:
        cur = conn.cursor()
        try:
            for movie in movies_list or []:
                movie["MovieName"] = _PUNCT_RE.sub("", movie["MovieName"])
                try:
                    cur.execute(sql, (movie["ID"], movie["MovieName"]))
                except pymysql.IntegrityError:
                    print(" "*20, movie["MovieName"], "already exists, skip……")
                except pymysql.Error as exc:
                    # Not a duplicate: report the real cause instead of
                    # claiming the row already exists.
                    print(" "*20, movie["MovieName"], "insert failed, skip:", exc)
                else:
                    new_movies.append(movie)
        finally:
            cur.close()
        conn.commit()
    finally:
        # Close the connection even when an unexpected exception propagates.
        conn.close()
    return new_movies


def Movie_new_and_sync(host, user, passwd, dbme, dbtarget, port, tabledelta, movies_list, tablesync):
    """Rebuild the delta table, mirror new movies to the sync table, queue release tasks.

    Steps, all on one connection and committed together at the end:

    1. Delete every row of ``dbme.tabledelta``.
    2. For each movie, insert ``(movie_id, movie_name)`` into ``tabledelta``
       and a matching row into ``dbtarget.tablesync``; rows the database
       rejects are reported and skipped.
    3. Insert two rows into ``back_brace.release_task``.

    Args:
        host, user, passwd, dbme, port: MySQL connection settings.
        dbtarget: Database that holds ``tablesync``.
        tabledelta: Delta table name inside ``dbme``.
        movies_list: Iterable of dicts with ``ID`` and ``MovieName`` keys.
        tablesync: Sync table name inside ``dbtarget``.

    Database and table names are interpolated into the SQL text and must come
    from trusted configuration; all values are bound by the driver.
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    delta_sql = "insert into %s(movie_id, movie_name) values(%%s, %%s)" % tabledelta
    sync_sql = (
        "insert into %s.%s(slot_type_id, slot_value, create_by, modify_by, "
        "gmt_create, gmt_modify, out_value) "
        "values(%%s, %%s, %%s, %%s, %%s, %%s, %%s)" % (dbtarget, tablesync))
    # NOTE(review): the release_task database is hard-coded rather than taken
    # from ``dbtarget``, as in the original -- confirm this is intended.
    release_sql = (
        "insert into back_brace.release_task(app_type,app_status,type,ref_id,"
        "status,register_id,create_by,modify_by,gmt_create,gmt_modify) "
        "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")

    conn = pymysql.connect(host=host, user=user, passwd=passwd, db=dbme,
                           port=port, charset="utf8")
    try:
        cur = conn.cursor()
        try:
            # Format both name parts together; the original
            # ``"... %s " % dbme + "." + tabledelta`` bound only ``dbme``.
            cur.execute("delete from %s.%s" % (dbme, tabledelta))

            for movie in movies_list:
                # One timestamp per row so gmt_create == gmt_modify.
                now = datetime.datetime.now().strftime(timestamp_format)
                try:
                    cur.execute(delta_sql, (movie["ID"], movie["MovieName"]))
                    cur.execute(sync_sql, ("xxxxxx", movie["MovieName"],
                                           "system", "system", now, now, ""))
                except pymysql.IntegrityError:
                    print(" " * 20, movie["MovieName"], "already exists, skip……")
                except pymysql.Error as exc:
                    # Not a duplicate: report the real cause.
                    print(" " * 20, movie["MovieName"], "insert failed, skip:", exc)

            now = datetime.datetime.now().strftime(timestamp_format)
            release_tasks = [
                ("BACKBRACE", "testpass", "SLOT", "xxxxxx", "init",
                 "SLOT_BACKBRACE_TESTPASS", "zhenghai.zhang", "zhenghai.zhang",
                 now, now),
                ("SKILL", "deploy", "SLOT", "xxxxxx", "init",
                 "SLOT_SKILL_DEPLOY", "zhenghai.zhang", "zhenghai.zhang",
                 now, now),
            ]
            try:
                for task in release_tasks:
                    # mogrify renders the statement exactly as it will be sent.
                    print(cur.mogrify(release_sql, task))
                    cur.execute(release_sql, task)
            except pymysql.Error:
                print("write into back_brace.release_task error!!!")
        finally:
            cur.close()
        conn.commit()
    finally:
        # Close the connection even when an unexpected exception propagates.
        conn.close()


def Email(to, subject, body):
    """Send an HTML e-mail to a single recipient via exchangelib.

    Args:
        to: Recipient e-mail address.
        subject: Subject line.
        body: Message body; wrapped in ``HTMLBody`` and therefore sent as HTML.

    The account credentials and sender address below are redacted
    placeholders and must be replaced with real values.
    """
    creds = Credentials(
        username="xxxxxx",
        password="xxxxxx")
    account = Account(
        primary_smtp_address="xxx@xxx.com",
        credentials=creds,
        autodiscover=True,
        access_type=DELEGATE)
    m = Message(
        account=account,
        subject=subject,
        body=HTMLBody(body),
        to_recipients=[Mailbox(email_address=to)])
    m.send_and_save()


if __name__ == "__main__":
    # get_info() returns None when the site cannot be read; stop with a clear
    # message instead of failing on tuple unpacking.
    info = get_info()
    if info is None:
        raise SystemExit("Could not read the movie listing summary; aborting.")
    pages, counts = info
    # NOTE(review): the page count reported by the site is overridden here, so
    # only the first page is crawled -- confirm this is intended and not a
    # debugging leftover.
    pages = 1

    update_movies = []
    for i in range(1, pages + 1):
        print("*"*30, i, "*"*30)
        # get_movies() returns None on failure; treat that as an empty page.
        movies_list = get_movies(i) or []
        new_movies = Movie_insert(host, user, passwd, dbme, port, table, movies_list)
        for new_movie in new_movies:
            print(new_movie["MovieName"], "Added")
            # Keep only the two fields needed by the sync step and the e-mail.
            update_movies.append({
                "ID": new_movie["ID"],
                "MovieName": new_movie["MovieName"],
            })
        time.sleep(1)  # pause between page requests
    print(update_movies)

    try:
        # Write the newly added movies into the movie_hotwords_delta table.
        Movie_new_and_sync(host, user, passwd, dbme, dbtarget, port, tabledelta, update_movies, tablesync)
    except Exception as exc:
        # Still best-effort (the e-mail below is sent regardless), but the
        # cause is now reported and KeyboardInterrupt is no longer swallowed.
        print("Movie update and sync Error!", exc)

    subject = "本次新增电影名称"
    body = "本次新增的电影名称为:<hr>" + "".join(
        movie["MovieName"] + "<br>" for movie in update_movies)
    for to in tolist:
        Email(to, subject, body)

 

欢迎大侠指点

 

Python3.5爬取cbooo.cn数据并且同步到mysql中

标签:add   auto   try   code   str   def   ror   address   hang   

原文地址:http://www.cnblogs.com/zhzhang/p/7766928.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!