python推荐淘宝物美价廉商品 2.0

时间：2017-01-09 22:13:43 阅读：540 评论：0 收藏：0 [点我收藏+]

标签：init sqrt 中文 rand except level item color blog

改动：

新增功能：可选择只看天猫或淘宝

代码做了模块化封装：参数配置和交互式输入统一放在单独的 setting.py 文件中管理，主程序 main.py 只负责读取 setting.py 中的参数并调用各功能函数。

main.py代码：

  1 # -*- coding: utf-8 -*-
  2 import urllib
  3 import urllib2
  4 import re
  5 import time 
  6 import random
  7 import os
  8 from math import log
  9 from math import log10
 10 from math import sqrt
 11 import sys
 12 
 13 import setting
 14 
 15 ‘‘‘在Python自己IDE上要注释掉一下两行‘‘‘
 16 reload(sys)  
 17 sys.setdefaultencoding(‘utf8‘)  # python2.x的的defaultencoding是ascii
 18 
 19 class counter(object):
 20     #计数器
 21     def __init__(self):
 22         self.count  = 0
 23         self.try_time = 0
 24         self.fail_time = 0
 25         self.url_list = []
 26         self.new_flag = True
 27         self.results=[]
 28         self.priSu=0
 29         self.descSu=0
 30         self.tm_tb = ‘‘
 31 
 32     def print_counter(self):
 33         print ‘try_time:‘, self.try_time,   "  get_count:" , self.count,   "  fail_time:",self.fail_time
 34 
 35 counter1 = counter()
 36 
 37 
 38 def post_request(url):
 39     ‘‘‘
 40     #使用代理
 41     proxy = {‘http‘:‘27.24.158.155:84‘}
 42     proxy_support = urllib2.ProxyHandler(proxy)
 43     # opener = urllib2.build_opener(proxy_support,urllib2.HTTPHandler(debuglevel=1))
 44     opener = urllib2.build_opener(proxy_support)
 45     urllib2.install_opener(opener)
 46     ‘‘‘
 47 
 48     #构造随机头部文件访问请求
 49     User_Agents=["Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0",
 50     "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36",
 51     "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", #
 52     "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
 53     "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11O"
 54     ]
 55     random_User_Agent = random.choice(User_Agents)
 56     #print random_User_Agent
 57 
 58     req =urllib2.Request(url) #！！
 59 
 60     req.add_header("User-Agent",random_User_Agent)
 61     req.add_header("GET",url)
 62     req.add_header("Referer",url)
 63     return req
 64 
 65 
 66 def recommend_rate(price,description,delivery,service,comments):
 67     #描述为绝对值
 68     av_p=counter1.priSu/counter1.count
 69     av_d=counter1.descSu/counter1.count
 70     rate=(description/av_d)**20*(description+delivery+service)*(av_p/(price))**0.1+log((comments+5),1000)
 71     #print ‘all count=‘,counter1.count
 72     #print "avrage price=",av_p,‘;‘,av_p/(price),‘;price‘,price,‘;comments=‘,comments,‘;descrip=‘,description
 73     #print ‘rate=‘,rate,‘(price)yinzi‘,(av_p/(price))**0.1,‘descrip_yinzi‘,(description/av_d)**20,‘comments_factor=‘,log((comments+50),100)
 74     return rate
 75 
 76 
 77 def product_rank(list):
 78     for x in list:
 79         #0开始为  x0商品名 、x1图片链接、x2商品链接、x3价格、x4评论数、 x5店铺名、 x6快递情况、x7描述相符情况3个、x8服务情况 
 80         rate=recommend_rate(x[3],x[7],x[6],x[8],x[4])
 81         x.append(rate)
 82 
 83 
 84 def get_user_rate(item_url):
 85     #暂时未使用该功能
 86     ‘‘‘获取卖家信用情况；未登录情况不能访问，或者需要在头部文件中加入cookie。。。；‘‘‘
 87     html=urllib2.urlopen(item_url)
 88     #"//rate.taobao.com/user-rate-282f910f3b70f2128abd0ee9170e6428.htm"
 89     regrex_rate=‘"(//.*?user\-rate.*?)"‘
 90     codes= re.findall(regrex_rate,html.read())
 91     html.close()
 92 
 93     user_rate_url= ‘http:‘+codes[0]
 94     print ‘uu‘, user_rate_url
 95 
 96     user_rate_html = urllib2.urlopen(user_rate_url)
 97     print user_rate_html.read()
 98     #title="4.78589分"
 99     desc_regex=u‘title="(4.[0-9]{5}).*?‘
100     de_pat=re.compile(desc_regex)
101     
102     descs = re.findall(de_pat,user_rate_html.read())
103     print len(descs)
104     item_url=‘https://item.taobao.com/item.htm?id=530635294653&ns=1&abbucket=0#detail‘
105 #get_user_rate(item_url)
106 ‘‘‘获取卖家信用情况；未登录情况不能访问。。。暂时 无用‘‘‘
107 
108 
def makeNewdir(savePath):
    """Create a new, previously non-existing directory and return its path.

    While *savePath* already exists a random digit (1-9) is appended to it, so
    the directory that is finally created may differ from the argument. The
    path actually created is returned so the caller can write into it.

    Args:
        savePath: desired directory path.

    Returns:
        The path of the directory that was created, or ``None`` if creation
        failed (a message is printed in that case).
    """
    while os.path.exists(savePath):
        # Name taken: extend it with one more random digit and try again.
        savePath = savePath + '%s' % random.randrange(1, 10)
    try:
        os.makedirs(savePath)
    except OSError:
        print("failed to make file path\nplease restart program")
        print(u'创建文件夹失败，请重新启动程序')
        return None
    print('ok,file_path we reserve results:  %s' % savePath)
    print(u'保存的路径为：')
    return savePath
120 
121 
def get_praised_good(url, file_open, keyword, counts, descripHrequ, servHrequ, descripNrequ):
    """Scan one search-result page and collect the products that pass the filters.

    Every accepted product is appended to ``counter1.results`` as a 9-item list:
    [0] name, [1] picture URL, [2] detail URL, [3] price, [4] comment count,
    [5] shop name, [6] delivery score, [7] description score, [8] service score.
    ``counter1.count``, ``try_time``, ``priSu``, ``descSu`` and ``url_list`` are
    updated accordingly. Scanning stops when ``counts`` products have been
    accepted, or when a product that was already accepted shows up again
    (``counter1.new_flag`` is then set to ``False``).

    Args:
        url: URL of the search-result page.
        file_open: unused; kept so existing callers keep working.
        keyword: regular expression that must match the product name.
        counts: total number of products wanted.
        descripHrequ: minimum for the description figure computed below.
        servHrequ: minimum for the service figure computed below.
        descripNrequ: minimum comment count.
    """
    html = urllib2.urlopen(post_request(url))
    try:
        code = html.read()
    finally:
        html.close()

    # Every match yields 15 strings:
    #   0 name, 1 picture URL, 2 detail URL, 3 price, 4 comment count, 5 shop nick,
    #   6-8 delivery triple, 9-11 description triple, 12-14 service triple.
    item_pattern = (
        r'raw_title":"(.*?)","pic_url":"(.*?)","detail_url":"(.*?)","view_price":"(.*?)"'
        r'.*?"comment_count":"(.*?)".*?"nick":"(.*?)"'
        r'.*?"delivery":\[(.*?),(.*?),(.*?)\],"description":\[(.*?),(.*?),(.*?)\],'
        r'"service":\[(.*?),(.*?),(.*?)\]'
    )

    for x in re.findall(item_pattern, code):
        if counter1.count >= counts:
            print("have get enough pruducts")
            break

        # NOTE(review): presumably the 2nd element of a triple is a direction
        # flag and the 3rd a percentage scaled by 100 -- confirm against the API.
        description_higher = int(x[10]) * float(x[11]) / 100
        service_higher = int(x[13]) * float(x[14]) / 100
        try:
            x4 = int(x[4])  # comment count
        except ValueError:
            x4 = 0  # empty or non-numeric comment count

        # Taobao-only mode: skip this non-Taobao item. The previous `break`
        # abandoned the rest of the page at the first such item.
        if counter1.tm_tb == 'taobao' and counter1.tm_tb not in x[2].split('.'):
            continue

        if not (description_higher >= descripHrequ
                and service_higher >= servHrequ
                and x4 >= descripNrequ):
            continue
        # Matching a Chinese keyword against the result is unresolved according
        # to the original author; put such words into the search term instead.
        if not re.findall(keyword, x[0]):
            continue

        x0 = x[0].replace(' ', '').replace('/', '')
        detail_url = 'http:' + x[2].decode('unicode-escape').encode('utf-8')
        x1 = 'http:' + x[1].decode('unicode-escape').encode('utf-8')

        if detail_url in counter1.url_list:
            counter1.new_flag = False
            print('no more new met products')
            print(counter1.url_list)
            print(detail_url)
            break

        # Convert everything before touching the counters so that a malformed
        # number cannot leave the counters and the result list out of step.
        x3 = float(x[3])          # price
        x6 = float(x[6]) / 100    # delivery score
        x9 = float(x[9]) / 100    # description score
        x12 = float(x[12]) / 100  # service score
        x5 = unicode(x[5], 'utf-8')  # shop name

        counter1.url_list.append(detail_url)
        counter1.try_time += 1
        counter1.count += 1
        counter1.priSu += x3
        counter1.descSu += x9

        counter1.results.append([x0, x1, detail_url, x3, x4, x5, x6, x9, x12])
187             
188 
def save_downpic(lis, file_open, savePath):
    """Download every product's picture and write its summary files.

    For each record in *lis* (layout: [0] name, [1] picture URL, [2] detail URL,
    [3] price, [4] comment count, [5] shop name, [6] delivery score,
    [7] description score, [8] service score, [9] rate) this:

    * downloads the picture to ``<savePath>/<index>___<name>.jpg``;
    * appends the detail URL to a txt file whose name carries the key figures;
    * prints a summary and appends one to *file_open*.

    A failure for one product is reported, counted in ``counter1.fail_time``
    and does not stop the remaining products.

    Args:
        lis: product records, already ranked.
        file_open: open, writable file receiving the per-product summaries.
        savePath: directory the pictures and txt files are written to.
    """
    print(len(lis))
    for cc, x in enumerate(lis):
        try:
            # os.path.join instead of a hard-coded '\\' so this also works
            # outside Windows; on Windows the resulting path is unchanged.
            pic_path = os.path.join(savePath, '%s___' % cc + unicode(x[0], 'utf-8') + '.jpg')
            urllib.urlretrieve(x[1], pic_path)

            txt_name = os.path.join(
                savePath,
                '%s__' % cc + 'custome_description_%s __' % x[7] + '__comments_%s_' % x[4]
                + '___price_%srmb___' % x[3] + x[5] + '.txt')
            with open(txt_name, 'a') as file_o:
                file_o.write(x[2])

            print(u'\nget_one_possible_fine_goods:\ngood_name: %s' % x[0].decode('utf-8'))
            print('rate= %s' % x[9])
            print(u'price: %s %s' % (x[3], x[5].decode('utf-8')))
            print('custome_description: %s -- described_number: %s   service: %s'
                  % (x[7], x[4], x[8]))
            print(u'%s \ngood_pic_url: %s' % (x[2].decode('utf-8'), x[1].decode('utf-8')))

            print(txt_name)
            print('%s th' % (cc + 1))

            file_open.write(u'%s__' % cc + u'%s' % x[0] + '\nprice:' + str(x[3]) + '￥,' + '\n' + str(x[2]) + '  \n' + str(x[5]) + '\ncustomer_description:' + str(x[7]) + 'described_number:' + str(x[4]) + '\n\n\n')

            print('get one -^-')
        except Exception as err:
            # Best effort per product: report why it failed and carry on.
            print("failed to down picture or creat txt")
            print(repr(err))
            counter1.fail_time += 1
        time.sleep(0.5)  # be gentle with the picture server
224 
225 
def get_all_praised_goods(serchProd, counts, savePath, keyword, price_min=0, price_max=0, descripHrequ=0, servHrequ=0, descripNrequ=0):
    """Walk the search-result pages, then rank and save the collected products.

    Pages are fetched until ``counts`` products are accepted, a page repeats an
    already accepted product, or 11 pages have been searched. The accepted
    products are scored with ``product_rank``, sorted by score (best first),
    saved with ``save_downpic`` and finally dumped tab-separated into
    ``found_goods.txt`` inside *savePath*.

    Args:
        serchProd: search term.
        counts: number of products wanted.
        savePath: existing directory receiving all output files.
        keyword: regular expression the product name must match.
        price_min: lower price bound.
        price_max: upper price bound; the price filter is only applied when
            ``price_min < price_max``.
        descripHrequ: minimum description figure (see ``get_praised_good``).
        servHrequ: minimum service figure (see ``get_praised_good``).
        descripNrequ: minimum comment count.
    """
    max_pages = 11       # give up after this many result pages
    items_per_page = 44  # the `s` offset advances by this much per page

    initial_url = 'https://s.taobao.com/search?q=' + serchProd
    if counter1.tm_tb == 'tmall':
        initial_url += '&filter_tianmao=tmall'
    # The closing %5D belongs to the price filter and is added only with it;
    # previously it was appended even when no filter had been opened. A lower
    # bound of 0 is a valid range start and no longer disables the filter.
    if price_min < price_max:
        initial_url += '&filter=reserve_price%5B' + '%s' % price_min + '%2C' + '%s' % price_max + '%5D'
    initial_url += '&s='

    print("initial_url %s" % initial_url)
    page_n = 0
    url_1 = None  # last page URL fetched; stays None if no page is fetched
    reserve_file = os.path.join(savePath, 'found_goods.txt')

    with open(reserve_file, 'a') as file_open:
        file_open.write('****************************\n')
        file_open.write(time.ctime())
        file_open.write('\n****************************\n')

        while counter1.new_flag and counter1.count < counts:
            url_1 = initial_url + '%s' % (items_per_page * page_n)
            print('url_1: %s' % url_1)
            page_n += 1

            get_praised_good(url_1, file_open, keyword, counts, descripHrequ, servHrequ, descripNrequ)
            print("let web network rest for 2s lest  make traffic jams ")
            time.sleep(2)
            print("%s pages have been searched" % page_n)
            if page_n >= max_pages:
                print("check keyword,maybe too restrict")
                break

        if url_1 is not None:
            print(url_1)

        product_rank(counter1.results)
        # Index 9 is the score appended by product_rank; best first.
        counter1.results.sort(key=lambda x: x[9], reverse=True)

        save_downpic(counter1.results, file_open, savePath)

        # Raw dump: one product per block, fields separated by tabs.
        for a in counter1.results:
            for b in a:
                file_open.write(unicode(str(b), 'utf-8'))
                file_open.write('\t')
            file_open.write('\n\n')

    counter1.print_counter()
282 
283 
def main():
    """Read the filter settings, create the result directory and run the search.

    When ``setting.userDefine`` is true the settings are first asked from the
    user through ``setting.inputPara()``; otherwise the values configured in
    the ``setting`` module are used as they are.
    """
    print(u'说明：\n本程序用于在淘宝上搜索商品时主动通过 价格范围、商品描述、服务态度、评论数来筛选商品;\n筛选出来的商品图片下载保存到磁盘（默认桌面新建find_worty_goods文件夹）并建立同序号开头的txt文件，图片显示商品，其旁的txt文件名显示价格等关键信息，txt里保存商品的淘宝链接')
    if setting.userDefine:
        # Let the user type in the filter requirements.
        setting.inputPara()

    serchProd = setting.serchProd        # search term
    if not serchProd:
        # Without a search term there is nothing to look for.
        print("no search word configured, nothing to do")
        return

    keyword = setting.keyword            # regex the product name must match; '' = no restriction
    price_min = setting.price_min        # price range
    price_max = setting.price_max
    descripHrequ = setting.descripHrequ  # results must reach at least this description figure
    servHrequ = setting.servHrequ        # results must reach at least this service figure
    descripNrequ = setting.descripNrequ  # minimum comment count
    counts = setting.counts              # number of products wanted
    counter1.tm_tb = setting.tm_tb       # '' = both, 'tmall' = Tmall only, 'taobao' = Taobao only

    savePath = u"results%s" % serchProd  # directory the results are saved to
    # makeNewdir may pick a different name when the directory already exists;
    # use the path it reports when it reports one.
    savePath = makeNewdir(savePath) or savePath

    get_all_praised_goods(serchProd, counts, savePath, keyword, price_min, price_max, descripHrequ, servHrequ, descripNrequ)


if __name__ == "__main__":
    main()
309 
310

View Code

setting.py

# -*- coding: utf-8 -*-

userDefine = False   # True: ask the user through inputPara(); False: use the values below

# Filter requirements
serchProd = '背包'    # search term
keyword = ''         # regex the product name must match (guards against unrelated suggestions); '' = no restriction
price_min = 22       # price range
price_max = 100
descripHrequ = 0     # %; results must exceed this value (0 = above average)
servHrequ = 0        # %; results must exceed this value (0 = above average)
descripNrequ = 6     # comment-count limit
counts = 25          # number of products to select
tm_tb = 'tmall'      # '' = no distinction, 'tmall' = Tmall only, 'taobao' = Taobao only


def inputPara():
    """Ask the user for the filter requirements and store them in this module.

    The answers overwrite the module-level settings (``serchProd``,
    ``keyword``, ``price_min``, ``price_max``, ``descripHrequ``, ``servHrequ``,
    ``descripNrequ``, ``counts``, ``tm_tb``). An empty or malformed answer
    falls back to the default announced in the corresponding prompt. If no
    search term is entered, ``serchProd`` is left empty and nothing else is
    asked.

    Numbers are parsed explicitly rather than with Python 2 ``input()``, which
    evaluates whatever is typed as Python code.
    """
    global serchProd, keyword, price_min, price_max, descripHrequ, servHrequ, descripNrequ, counts, tm_tb

    def read_numbers(expected):
        """Read one line holding `expected` comma-separated numbers."""
        fields = raw_input().split(',')
        if len(fields) != expected:
            raise ValueError('expected %d comma separated numbers' % expected)
        return [float(field) if '.' in field else int(field) for field in fields]

    print("please input reserch _goods_name")
    serchProd = raw_input().replace(' ', '')  # search term, stray spaces removed
    # NOTE(review): unlike the keyword below, the search term is not converted
    # from GBK to UTF-8 -- confirm which encoding the callers expect.

    if not serchProd:
        print("no search goods，please restart")
        print(u'没有输入商品名称，请重新启动程序')
        return

    print(u"if customise price_range ,decriptiom require .etc.\ninput Y/N \n default by : no price limit avarage than descriptiom,get 50 products \n 默认要求为：无价格限制，商品描述、快递、服务高于均值，获取50个商品。自定义要求请输入 ‘Y’ (区分大小写)")
    if raw_input() != 'Y':
        # No customisation wanted: use the defaults.
        counts = 50
        keyword = ''
        tm_tb = ''
        price_min, price_max, descripHrequ, servHrequ, descripNrequ = 0, 0, 0, 0, 0
        return

    # Price range
    print("\nplease input  _minimal price and _maximal price;   \ndefault by 0,10000\nnext by 'enter'key input nothing means by default,the same below ")
    print(u'请输入价格范围 ；默认0-10000 ；两项用半角逗号","分隔 按回车键确认；什么也不输入代表使用默认值 ')
    try:
        price_min, price_max = read_numbers(2)
    except (ValueError, EOFError):
        print('not input or wrong number,use default range')
        price_min, price_max = 0, 10000

    # Tmall only / Taobao only / both
    print(u'是否要求 只看天猫/正品保障  还是只看淘宝 \n 只看天猫输入 tmall ,只看淘宝输入taobao，都看则回车略过')
    try:
        tm_tb = raw_input().decode("gbk").encode("utf-8").strip()
    except (UnicodeError, EOFError):
        tm_tb = ''
    if tm_tb not in ('', 'tmall', 'taobao'):
        # Any other value would silently behave like "no restriction".
        print('unknown choice,use default: no restriction')
        tm_tb = ''

    # Keyword the product name must contain (regular expression)
    print("please input _keyword that goods name must include:\n(more than one keyword must use Regular Expression); default by no kewords")
    try:
        keyword = raw_input().decode("gbk").encode("utf-8")
    except (UnicodeError, EOFError):
        keyword = ''

    # Description / service requirements
    print("\nplease input  _description_higher_percent_require and _service_higher__percent_require\n range:(-100,100) ;   \ndefault by 0,0  I.e better than average")
    print(u'请输入商品描述、服务高于平均值的百分比-100 ~100')
    try:
        descripHrequ, servHrequ = read_numbers(2)
    except (ValueError, EOFError):
        print('not input or wrong number,use default range')
        descripHrequ = 0
        servHrequ = 0

    # Minimum comment count
    print(u"\nplease input description count limit,  default more than 5\n输入最低商品评价数，默认大于5")
    try:
        descripNrequ = int(raw_input())
    except (ValueError, EOFError):
        print('not input or wrong number,use default range')
        descripNrequ = 5

    # Number of results wanted
    print(u"\nplease input how many results you want,  default by 50\n您要获取的商品数目，默认50")
    try:
        counts = int(raw_input())
    except (ValueError, EOFError):
        counts = 50

View Code

python推荐淘宝物美价廉商品 2.0

标签：init sqrt 中文 rand except level item color blog

原文地址：http://www.cnblogs.com/willowj/p/6266507.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行