爬取PPT模版

from time import perf_counter
import requests
from scrapy.selector import Selector
from multiprocessing import Queue,Process,Pool
 
def RequestsDX(url, timeout=30):
    """Send a GET request to *url* with browser-like headers.

    Args:
        url: Address to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument. Without one, requests waits indefinitely on a stalled
            server, which would hang the whole pipeline stage. Pass ``None``
            to restore the old wait-forever behaviour.

    Returns:
        The ``requests`` response object, with ``encoding`` forced to
        ``'utf-8'`` so ``response.text`` is decoded as UTF-8.
    """
    headers = {
        'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/86.0.4240.111 Safari/537.36 Edg/86.0.622.51',
        'Accept'     : 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,'
                       'application/'
                       'signed-exchange;v=b3;q=0.9'
    }
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Override whatever charset requests detected so .text decodes as UTF-8.
    response.encoding = 'utf-8'
    return response
 
def SelectorDX(url):
    """Fetch *url* with RequestsDX and return a scrapy Selector over its text."""
    return Selector(text=RequestsDX(url).text)
 
def category_page(q):
    """Put the URL of every listing page of the category onto *q*.

    The first page is queued unconditionally. Pages 2 through 99 are probed
    in order with RequestsDX; each one that answers HTTP 200 is printed and
    queued, and probing stops at the first page that does not.
    """
    q.put('http://www.ypppt.com/moban/shuzhi/')
    page_no = 2
    while page_no < 100:
        page_url = 'http://www.ypppt.com/moban/shuzhi/list-%s.html' % page_no
        if RequestsDX(page_url).status_code != 200:
            # First non-200 page is treated as the end of the category.
            break
        print(page_url)
        q.put(page_url)
        page_no += 1
 
def download_page_parse(q, url_q):
    """Turn each listing page in *q* into per-PPT download-page URLs.

    For every listing-page URL taken from *q*, each ``<li>`` under
    ``/html/body/div[2]/ul`` is inspected. The article id is the last path
    segment of its link's ``href`` up to ``.html``. The resulting
    ``http://www.ypppt.com/p/d.php?aid=<id>`` URL is printed and put on
    *url_q*.

    Args:
        q: Queue of listing-page URLs (drained until it reports empty).
        url_q: Queue that receives the download-page URLs.
    """
    while not q.empty():
        selector = SelectorDX(q.get())
        # Iterate the <li> nodes directly. The earlier index loop used
        # range(1, count) with 1-based XPath positions and so never
        # visited the last item on a page.
        for item in selector.xpath('/html/body/div[2]/ul/li'):
            # Read the attribute itself instead of slicing serialized HTML.
            href = item.xpath('./a/@href').extract_first()
            if not href:
                # <li> without a link: nothing to download.
                continue
            aid, suffix, _ = href.rsplit('/', 1)[-1].partition('.html')
            if not suffix or not aid:
                # Link does not look like ".../<aid>.html"; skip it.
                continue
            id_url = 'http://www.ypppt.com/p/d.php?aid=' + aid
            print(id_url)
            url_q.put(id_url)
 
def download_url_parse(url_q, download_q):
    """Resolve each download page in *url_q* to a ``(name, file_url)`` pair.

    Every URL taken from *url_q* is fetched. The file link is the ``href``
    of the first ``<li>``'s anchor, and the name is the page's ``<h1>`` with
    the tags and the `` - 下载页`` suffix removed. All pairs are collected
    into one list, which is printed and put on *download_q* as a single item
    once *url_q* reports empty.

    Args:
        url_q: Queue of download-page URLs.
        download_q: Queue that receives the list of ``(name, file_url)``
            tuples.
    """
    link_xpath = '/html/body/div/div/ul/li[1]/a/@href'
    filename_xpath = '/html/body/div/div/div[2]/div[2]/h1'
    download_list = []
    while not url_q.empty():
        page_url = url_q.get()
        selector = SelectorDX(page_url)
        # Take the href attribute directly. Slicing between the first and
        # last quote of the serialized tag breaks as soon as the anchor
        # carries a second attribute, and keeps HTML escapes such as &amp;.
        url_download = selector.xpath(link_xpath).extract_first()
        file_name = selector.xpath(filename_xpath).extract_first()
        if not url_download or not file_name:
            # extract_first() yields None when the node is missing. Skip this
            # page rather than crash and lose every pair collected so far.
            print('跳过', page_url)
            continue
        name = (
            file_name.replace(' - 下载页', '')
            .replace('<h1>', '')
            .replace('</h1>', '')
        )
        download_list.append((name, url_download))
    print(download_list)
    download_q.put(download_list)
 
def down_load(download_list):
    r"""Download one PPT archive and save it as ``D:\ppt\<name>.zip``.

    Args:
        download_list: Despite the name, a single ``(name, url)`` pair:
            index 0 is the display name used for the file, index 1 is the
            URL to fetch.

    A response whose status is not 200 is reported and skipped instead of
    being written to disk as if it were an archive.
    """
    # Function-scope imports keep this block self-contained.
    import os
    import re

    name, url = download_list[0], download_list[1]
    response = RequestsDX(url)
    print('=' * 100)
    print('正在下载', name)
    if response.status_code != 200:
        print('下载失败', name, response.status_code)
        return
    # Page titles may contain characters Windows forbids in file names.
    safe_name = re.sub(r'[\\/:*?"<>|]', '_', name)
    # open() does not create missing directories.
    os.makedirs(r'D:\ppt', exist_ok=True)
    with open(r'D:\ppt\%s.zip' % safe_name, 'wb') as f:
        for chunk in response.iter_content(chunk_size=1024):
            f.write(chunk)
    print('下载完成')
 
if __name__ == '__main__':
    # Script entry point: run the three scraping stages one after another,
    # then download every discovered file with a pool of 10 workers and
    # print the elapsed time.
    from multiprocessing import Manager

    t = perf_counter()

    # Manager-backed queues are used instead of multiprocessing.Queue.
    # A child that has put data on a plain Queue cannot exit until that
    # data is flushed to the pipe, so joining it before anyone reads the
    # queue can deadlock once the payload outgrows the pipe buffer.
    # Manager queues have no feeder thread, and their empty() is reliable.
    with Manager() as manager:
        q = manager.Queue()
        url_q = manager.Queue()
        download_q = manager.Queue()

        stages = [
            Process(target=category_page, args=(q,)),
            Process(target=download_page_parse, args=(q, url_q)),
            Process(target=download_url_parse, args=(url_q, download_q)),
        ]

        # Each stage drains the queue filled by the previous one, so a stage
        # must finish before the next one starts.
        for stage in stages:
            stage.start()
            stage.join()

        # If the last stage died before publishing its list, fall back to an
        # empty list instead of blocking forever on get().
        download_list = [] if download_q.empty() else download_q.get()

    # The context manager shuts the worker processes down after map() returns.
    with Pool(10) as pool:
        pool.map(down_load, download_list)

    t1 = perf_counter()
    cost = t1 - t
    print(cost, 's')

未经允许不得转载:军哥驿站 » 爬取PPT模版

赞 (1) 打赏

觉得文章有用就打赏一下文章作者

支付宝扫一扫打赏

微信扫一扫打赏