Multithreading:

import threading
from multiprocessing import Queue
from time import sleep
from bs4 import BeautifulSoup
from requests import get
import re

class myThread(threading.Thread):
    def __init__(self, qlock, queue):
        threading.Thread.__init__(self)
        self.qlock = qlock
        self.queue = queue

    def run(self):
        process(self.qlock, self.queue)

def process(qlock, queue):
    qlock.acquire()  # acquire the mutex
    try:
        data = queue.get()  # read one item from the queue
        print(data)
    finally:
        qlock.release()  # release the lock
    sleep(1)

# Build the queue
workQueue = Queue(50)
qlock = threading.Lock()
url = 'https://www.pixiv.net/ranking.php?mode=daily'
r = get(url, timeout=1)
html = r.text
soup = BeautifulSoup(html, 'lxml')
urls = soup.find_all('img')
links = []
pattern = re.compile(r'data-src="(.+?)"')  # compile once, outside the loop
for u in urls:
    link = pattern.findall(str(u))
    workQueue.put(link)  # enqueue
    links.append(link)
threads = []
for u in links:
    thread = myThread(qlock, workQueue)
    thread.daemon = True
    thread.start()
    threads.append(thread)
# Busy-wait until the queue drains
while not workQueue.empty():
    pass
# Wait for the threads to finish
for t in threads:
    t.join()
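The spin loop at the end burns a full CPU core while it waits. The standard-library queue.Queue (sufficient for threads, and lighter than multiprocessing.Queue) offers a task_done()/join() pair that gives the same synchronization without spinning; a minimal sketch of that pattern, with placeholder items standing in for the scraped links:

import threading
from queue import Queue

q = Queue()

def worker():
    while True:
        item = q.get()
        print(item)
        q.task_done()  # mark one unit of work as finished

for _ in range(4):
    threading.Thread(target=worker, daemon=True).start()

for link in ['a.jpg', 'b.jpg', 'c.jpg']:  # placeholders for the real links
    q.put(link)

q.join()  # blocks until every put() has a matching task_done()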
Multiprocessing:

1. Create a process pool with the Pool module:

from multiprocessing import Pool
from bs4 import BeautifulSoup
from requests import get
import re
import os

def run_process(url):
    print(url)

if __name__ == '__main__':
    url = 'https://www.pixiv.net/ranking.php?mode=daily'
    html = get(url, timeout=1).text
    soup = BeautifulSoup(html, 'lxml')
    urls = soup.find_all('img')
    links = []
    pattern = re.compile(r'data-src="(.+?\.jpg)"')  # escape the dot before "jpg"
    for u in urls:
        link = pattern.findall(str(u))
        links.append(link)
    process = Pool(os.cpu_count())  # one worker per CPU core
    for u in links:
        process.apply_async(run_process, args=(u,))
    process.close()  # no more tasks will be submitted
    process.join()   # wait for all workers to finish
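Since every task applies the same function to one item of a list, Pool.map expresses the same thing in a single call and blocks until everything is done; a minimal sketch, with a placeholder list standing in for the scraped links:

from multiprocessing import Pool

def run_process(url):
    print(url)

if __name__ == '__main__':
    links = [['img1.jpg'], ['img2.jpg']]  # placeholder for the scraped list
    with Pool() as pool:                  # defaults to os.cpu_count() workers
        pool.map(run_process, links)      # blocks until every task finishes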
2. Inter-process communication with the Process and Queue modules (though my queue writes here are not parallelized):

from multiprocessing import Process, Queue
from bs4 import BeautifulSoup
from requests import get
import re

class myProcess(Process):
    def __init__(self, queue):
        Process.__init__(self)
        self.queue = queue

    def run(self):
        run_process(self.queue)

def run_process(queue):
    data = queue.get()
    print(data)

if __name__ == '__main__':
    url = 'https://www.pixiv.net/ranking.php?mode=daily'
    html = get(url, timeout=1).text
    soup = BeautifulSoup(html, 'lxml')
    urls = soup.find_all('img')
    queue = Queue(50)
    links = []
    pattern = re.compile(r'data-src="(.+?\.jpg)"')
    for u in urls:
        link = pattern.findall(str(u))
        queue.put(link)
        links.append(link)
    processes = []
    for u in links:
        process = myProcess(queue)
        process.start()
        processes.append(process)
    # Busy-wait until the queue drains
    while not queue.empty():
        pass
    # Wait for every worker process to exit
    for p in processes:
        p.join()
The second one is noticeably slower than the first, and I'm not sure why... (A likely factor: Pool reuses a fixed set of workers, while this version spawns a fresh process per link and pays the interpreter startup cost every time.)
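A simple way to confirm is to time each variant; a minimal sketch, where run_pool and run_process_per_task are hypothetical wrappers around the two versions above:

import time

def timed(label, fn):
    """Run fn() once and print the elapsed wall-clock time."""
    start = time.perf_counter()
    fn()
    print('{}: {:.2f}s'.format(label, time.perf_counter() - start))

if __name__ == '__main__':
    # timed('Pool', run_pool)                          # variant 1
    # timed('Process per task', run_process_per_task)  # variant 2
    timed('demo', lambda: sum(range(10 ** 6)))  # placeholder workload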
But the examples above are only CPU-bound work; let's test an IO-bound mini-crawler to see the effect:

1. Multithreading:

import threading
from multiprocessing import Queue
from time import sleep
from bs4 import BeautifulSoup
from requests import get
import re

class myThread(threading.Thread):
    def __init__(self, qlock, queue):
        threading.Thread.__init__(self)
        self.qlock = qlock
        self.queue = queue

    def run(self):
        process(self.qlock, self.queue)

def process(qlock, queue):
    qlock.acquire()  # acquire the mutex
    try:
        url = queue.get()[0]  # read one link from the queue
        img = get(url, timeout=1).content
        name = url.split('/')[-1]
        imgid = name[:8]
        with open('C:/Users/adimin/Desktop/video/{}.jpg'.format(imgid), 'wb') as fp:
            fp.write(img)
        print('download: ' + url)
    finally:
        qlock.release()  # release the lock
    sleep(1)

# Build the queue
workQueue = Queue(50)
qlock = threading.Lock()
url = 'https://www.pixiv.net/ranking.php?mode=daily'
html = get(url, timeout=1).text
soup = BeautifulSoup(html, 'lxml')
urls = soup.find_all('img')
links = []
pattern = re.compile(r'data-src="(.+?\.jpg)"')
for u in urls:
    link = pattern.findall(str(u))
    workQueue.put(link)  # enqueue
    links.append(link)
threads = []
for u in links:
    thread = myThread(qlock, workQueue)
    thread.start()
    threads.append(thread)
# Busy-wait until the queue drains
while not workQueue.empty():
    pass
# Wait for the threads to finish
for t in threads:
    t.join()
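Note that the lock here is held across the entire download, so only one thread can fetch at a time, which serializes exactly the IO this example is meant to parallelize. Queue.get is itself thread-safe, so the critical section can shrink to almost nothing; a sketch of the worker with the download moved outside the lock (same logic as above, file path shortened):

from requests import get

def process(qlock, queue):
    with qlock:                # hold the lock only while touching shared state
        url = queue.get()[0]   # (Queue.get is thread-safe, so even this is optional)
    img = get(url, timeout=1).content  # network IO now overlaps across threads
    name = url.split('/')[-1]
    with open(name[:8] + '.jpg', 'wb') as fp:
        fp.write(img)
    print('download: ' + url)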
2. Multiprocessing:

from multiprocessing import Process, Queue
from bs4 import BeautifulSoup
from requests import get
import re

class myProcess(Process):
    def __init__(self, queue):
        Process.__init__(self)
        self.queue = queue

    def run(self):
        run_process(self.queue)

def run_process(queue):
    url = queue.get()[0]  # read one link from the queue
    img = get(url, timeout=1).content
    name = url.split('/')[-1]
    imgid = name[:8]
    with open('C:/Users/adimin/Desktop/video/{}.jpg'.format(imgid), 'wb') as fp:
        fp.write(img)
    print('download: ' + url)

if __name__ == '__main__':
    url = 'https://www.pixiv.net/ranking.php?mode=daily'
    html = get(url, timeout=1).text
    soup = BeautifulSoup(html, 'lxml')
    urls = soup.find_all('img')
    queue = Queue(50)
    links = []
    pattern = re.compile(r'data-src="(.+?\.jpg)"')
    for u in urls:
        link = pattern.findall(str(u))
        queue.put(link)
        links.append(link)
    processes = []
    for u in links:
        process = myProcess(queue)
        process.start()
        processes.append(process)
    # Busy-wait until the queue drains
    while not queue.empty():
        pass
    # Wait for every worker process to exit
    for p in processes:
        p.join()
In the end, the runtimes feel about the same... I still can't see much of a difference. (Plausibly because the lock serializes the threaded downloads while per-process startup overhead eats into the multiprocessing gains, so neither version overlaps much work in practice.)
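For a fairer comparison, concurrent.futures puts thread and process pools behind the same interface, so the executor class becomes the only variable; a minimal sketch, with a hypothetical download function standing in for the per-URL work above:

from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import time

def download(url):
    time.sleep(1)  # stand-in for the requests.get + file write above
    return url

def run(executor_cls, urls):
    start = time.perf_counter()
    with executor_cls(max_workers=8) as ex:
        list(ex.map(download, urls))   # force the iterator to complete
    return time.perf_counter() - start

if __name__ == '__main__':
    urls = ['u{}'.format(i) for i in range(16)]
    print('threads:   {:.2f}s'.format(run(ThreadPoolExecutor, urls)))
    print('processes: {:.2f}s'.format(run(ProcessPoolExecutor, urls)))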