#!/usr/bin/python
# -*- coding: utf-8 -*-

import requests,time,re
from lxml import etree
from decimal import Decimal
from unicodedata import normalize
from multiprocessing import Pool,cpu_count

# Default HTTP headers sent with every request: a desktop Firefox User-Agent
# plus a Referer pointing at the site's own front page.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 5.1; rv:52.0) Gecko/20100101 Firefox/52.0",
    "Referer": "https://www.vodtw.com/",
}

def getHtml(url,headers=headers,timeout=30):
    """Fetch *url* with a GET request and return the raw response body.

    Args:
        url: Absolute URL to download.
        headers: HTTP headers to send; defaults to the module-level ``headers``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so one stalled
            connection would block a worker forever.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        requests.exceptions.RequestException: On connection failure, timeout,
            or a 4xx/5xx HTTP status.
    """
    r = requests.get(url, headers=headers, proxies=None, timeout=timeout)
    # Fail loudly on HTTP errors instead of handing an error page to the parser.
    r.raise_for_status()
    return r.content

def useXpath(html):
    """Return the ``href`` of every ``<a>`` found under ``dl/dd`` in *html*.

    Args:
        html: HTML markup (as accepted by ``lxml.etree.HTML``).

    Returns:
        The list of ``href`` attribute values matched by ``//dl/dd/a/@href``,
        in document order.
    """
    tree = etree.HTML(html)
    return tree.xpath("//dl/dd/a/@href")

def getContent(url):
    """Download one chapter and return its title, text and absolute URL.

    The chapter page embeds only the first part of the text; the remainder is
    served from a second URL that the page exposes as ``bufurl='...'`` and is
    fetched with a separate request.

    Args:
        url: Chapter link relative to ``https://www.vodtw.com/book/516/``.

    Returns:
        A dict with the keys ``"title"``, ``"content"`` and ``"url"``.
    """
    url = 'https://www.vodtw.com/book/516/' + url
    raw = getHtml(url)

    # Only part of each chapter is in the page itself; the rest is loaded
    # through an asynchronous (XHR) request whose path is given by `bufurl`.
    match = re.search("bufurl='(.*?)'", raw.decode('utf-8'))

    tree = etree.HTML(raw)
    paragraphs = tree.xpath("//*[@id='content']/text()")
    headings = tree.xpath("//h1/text()")

    # The chapter title is read from the second <h1>. Fall back to the first
    # one, or to the URL, instead of raising IndexError on an unexpected page.
    if len(headings) > 1:
        title = headings[1]
    elif headings:
        title = headings[0]
    else:
        title = url
    print(f"正在抓取 {url} {title}")

    # NFKD-normalise each text node and join them with newlines.
    content = '\n'.join(normalize("NFKD", item) for item in paragraphs)

    if match:
        nextUrl = 'https://www.vodtw.com' + match.group(1)
        content += getHtml(nextUrl).decode('utf-8').replace('<br><br>','\n')
    else:
        # No second part advertised: keep the inline text rather than crash.
        print(f"未找到 bufurl,仅保存页面内正文 {url}")

    return {"title":title,"content":content,"url":url}

def _write_chapters(results, filename):
    """Wait for a batch of async results and append the chapters to *filename*.

    Args:
        results: Objects returned by ``pool.apply_async``; each is expected to
            yield a dict with ``"title"`` and ``"content"`` keys. Falsy
            results are skipped.
        filename: Path of the UTF-8 text file to append to.
    """
    # .get() blocks until the worker is done and re-raises any worker error;
    # iterating in submission order keeps the chapters in reading order.
    chapters = [chapter for chapter in (res.get() for res in results) if chapter]
    if not chapters:
        return
    with open(filename,'a',encoding='utf-8') as f:
        for chapter in chapters:
            f.write(chapter["title"] + '\n')
            f.write(chapter["content"] + '\n')


if __name__ == "__main__":
    start = time.time()
    url = 'https://www.vodtw.com/book/516/'
    mulu = useXpath(getHtml(url))
    filename = '寒门枭士.txt'
    batch_size = 10

    # The context manager terminates the workers if anything below raises.
    with Pool(cpu_count()) as pool:
        pending = []
        # The first 18 links of the index are skipped.
        # NOTE(review): presumably these are not regular chapter links - confirm.
        for chapter_url in mulu[18:]:
            pending.append(pool.apply_async(func=getContent, args=(chapter_url,)))

            if len(pending) == batch_size:
                # Write to the file once every 10 chapters, then pause briefly.
                _write_chapters(pending, filename)
                pending.clear()
                time.sleep(3)

        # Flush the final partial batch; previously these results were never
        # collected, so the last (count % 10) chapters were silently lost.
        _write_chapters(pending, filename)

        # Close the pool and wait for all worker processes to finish.
        pool.close()
        pool.join()

    # Sample the clock once so minutes and seconds describe the same instant.
    minutes, seconds = divmod(time.time() - start, 60)
    print(f"抓取完毕,总共耗时:{Decimal(minutes).quantize(Decimal('0'))}分{Decimal(seconds).quantize(Decimal('0.00'))}秒")

# Tags: python

# Comments are closed.