from time import perf_counter
import requests
from scrapy.selector import Selector
from multiprocessing import Queue, Process, Pool
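# Pipeline: category_page collects every listing page of the "shuzhi"
# category, download_page_parse turns each entry's aid into a d.php
# download-page URL, download_url_parse reads the real file link and
# title from that page, and a process pool downloads the files.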
def RequestsDX(url, stream=False):  # wrap requests.get so the headers live in one place
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/86.0.4240.111 Safari/537.36 Edg/86.0.622.51',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,'
                  'application/signed-exchange;v=b3;q=0.9'
    }
    # stream=True lets down_load fetch large files in chunks instead of all at once
    response = requests.get(url=url, headers=headers, stream=stream)
    response.encoding = 'utf-8'
    return response
def SelectorDX(url):  # build a scrapy Selector for a page, for reuse below
    response = RequestsDX(url)
    selector = Selector(text=response.text)
    return selector
def category_page(q):  # collect the URL of every listing page in the category
    q.put('http://www.ypppt.com/moban/shuzhi/')  # page 1 has no list-N suffix
    for i in range(2, 100):
        ppt_url2 = 'http://www.ypppt.com/moban/shuzhi/list-%s.html' % i
        response = RequestsDX(ppt_url2)
        if response.status_code == 200:
            print(ppt_url2)
            q.put(ppt_url2)
        else:
            break  # a non-200 response means we ran past the last page
def download_page_parse(q, url_q):  # read each listing page's aids and build every PPT's download-page URL
    while not q.empty():
        selector = SelectorDX(q.get())
        page_id_sum = len(selector.xpath('/html/body/div[2]/ul/li'))
        for i in range(1, page_id_sum + 1):  # XPath positions are 1-based, so + 1 keeps the last <li>
            href = selector.xpath('/html/body/div[2]/ul/li[%s]/a/@href' % i).extract_first()
            a = href.find('.html')
            b = href.rfind('/')
            # the aid is the number between the last '/' and '.html' in the href
            id_url = 'http://www.ypppt.com/p/d.php?aid=' + href[b + 1:a]
            print(id_url)
            url_q.put(id_url)
def download_url_parse(url_q, download_q):  # read each download page's file link and PPT name
    download_list = []
    while not url_q.empty():
        selector = SelectorDX(url_q.get())
        url_download = selector.xpath('/html/body/div/div/ul/li[1]/a/@href').extract_first()
        file_name = selector.xpath('/html/body/div/div/div[2]/div[2]/h1/text()').extract_first()
        name = file_name.replace(' - 下载页', '')  # the <h1> ends with the site's "download page" suffix
        download_list.append((name, url_download))
    print(download_list)
    download_q.put(download_list)  # put the finished list once, not once per iteration
def down_load(item):  # download one PPT; item is a (name, url) tuple from download_list
    name, url = item
    response = RequestsDX(url, stream=True)
    print('=' * 100)
    print('Downloading', name)
    with open(r'D:\ppt\%s.zip' % name, 'wb') as f:
        for chunk in response.iter_content(chunk_size=1024):
            f.write(chunk)
    print('Download finished')
if __name__ == '__main__':
    t = perf_counter()
    q = Queue()
    url_q = Queue()
    download_q = Queue()
    p1 = Process(target=category_page, args=(q,))
    p2 = Process(target=download_page_parse, args=(q, url_q))
    p3 = Process(target=download_url_parse, args=(url_q, download_q))
    for p in (p1, p2):  # each stage needs the previous queue fully filled, so run them in turn
        p.start()
        p.join()
    p3.start()
    download_list = download_q.get()  # drain the queue before join, or p3's feeder thread may block
    p3.join()
    pool = Pool(10)  # fetch up to 10 files at once
    pool.map(down_load, download_list)
    pool.close()
    pool.join()
    t1 = perf_counter()
    cost = t1 - t
    print(cost, 's')
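One caveat: the script writes into D:\ppt but never creates that folder, so the open() call fails on a clean machine. A minimal guard, using only the standard library, could sit at the top of the __main__ block:

import os
os.makedirs(r'D:\ppt', exist_ok=True)  # create the save folder if it is missing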