"""Multi-process crawler (blog-post example, 2018-02-21, category: Python3).

Collecting data with a single process wastes a lot of wall-clock time; by
opening several processes to visit the target pages in parallel we improve
throughput.  Basic usage of a process pool:

    from multiprocessing import Pool
    pool = Pool(processes=4)
    pool.map(func, iterable)

The example below scrapes joke listings from qiushibaike.com and compares
sequential fetching against 2-process and 4-process pools.
"""
import re
import time
from multiprocessing import Pool

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0'
}

# Patterns compiled once at module level (raw strings: `\d` in a plain
# string literal is a DeprecationWarning on modern Python).
_NAME_RE = re.compile(r'<h2>(.*?)</h2>', re.S)
_CONTENT_RE = re.compile(r'<div class="content">.*?<span>(.*?)</span>', re.S)
_NUMBER_RE = re.compile(r'<i class="number">(\d+)</i>', re.S)


def parse_page(html):
    """Parse one listing page and return a list of joke-info dicts.

    Each dict has the keys ``name``, ``content``, ``laugh`` and ``comment``
    (all stripped strings).  Split out from ``re_script`` so the parsing
    logic can be exercised without any network access.
    """
    names = _NAME_RE.findall(html)
    contents = _CONTENT_RE.findall(html)
    numbers = _NUMBER_RE.findall(html)
    # BUG FIX: the original ran the identical regex twice, so every
    # 'comment' value duplicated the 'laugh' value.  On the page markup the
    # laugh (vote) count and the comment count appear as alternating
    # <i class="number"> elements, so split the matches by parity.
    # NOTE(review): assumes the two counters strictly alternate in document
    # order — confirm against the live markup.
    laughs = numbers[0::2]
    comments = numbers[1::2]
    return [
        {
            'name': name.strip(),
            'content': content.strip(),
            'laugh': laugh.strip(),
            'comment': comment.strip(),
        }
        for name, content, laugh, comment in zip(names, contents, laughs, comments)
    ]


def re_script(url):
    """Fetch *url* and print one info dict per joke found on the page."""
    # Imported lazily so the pure-parsing helper above stays usable (and
    # testable) in environments without the third-party `requests` package.
    import requests
    # Timeout added: the original could hang forever on a stalled server.
    res = requests.get(url, headers=headers, timeout=10)
    for info in parse_page(res.text):
        print(info)


if __name__ == "__main__":
    urls = ['https://www.qiushibaike.com/8hr/page/{}/'.format(i) for i in range(1, 3)]

    # Baseline: fetch the pages one after another.
    start_1 = time.time()
    for url in urls:
        re_script(url)
    end_1 = time.time()
    print('这是一个接着一个的耗时:', end_1 - start_1)

    # Two worker processes.  The context manager tears the pool down when
    # the (synchronous) map() has finished — the original leaked both pools.
    start_2 = time.time()
    with Pool(processes=2) as pool:
        pool.map(re_script, urls)
    end_2 = time.time()
    print('两个进程所耗时间: ', end_2 - start_2)

    # Four worker processes (more workers than URLs here, so the extras idle).
    start3 = time.time()
    with Pool(processes=4) as pool:
        pool.map(re_script, urls)
    end3 = time.time()
    print('四个进程耗时:', end3 - start3)