#!/usr/bin/python
# -*- coding: utf-8 -*-
import requests,time,re
from lxml import etree
from decimal import Decimal
from unicodedata import normalize
from multiprocessing import Pool,cpu_count
# Default HTTP request headers; used as the default ``headers`` argument of getHtml.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 5.1; rv:52.0) Gecko/20100101 Firefox/52.0",
    "Referer": "https://www.vodtw.com/",
}
def getHtml(url, headers=headers, timeout=30):
    """Download *url* and return the raw response body.

    Args:
        url: Absolute URL to request.
        headers: HTTP request headers; defaults to the module-level
            ``headers`` dict.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        requests.exceptions.Timeout: If the server does not respond within
            *timeout* seconds.
    """
    # Without an explicit timeout requests waits indefinitely, so a single
    # stalled connection would block the caller forever.
    r = requests.get(url, headers=headers, proxies=None, timeout=timeout)
    return r.content
def useXpath(html):
    """Parse an HTML document and return the ``href`` of every ``dl/dd/a`` link.

    Args:
        html: HTML markup to parse.

    Returns:
        The list of ``href`` attribute values matched by ``//dl/dd/a/@href``,
        in document order.
    """
    tree = etree.HTML(html)
    return tree.xpath("//dl/dd/a/@href")
def getContent(url):
    """Fetch one chapter and return its title, full text and URL.

    The chapter page itself carries only the first part of the text; the
    remainder is served from a second URL that the page announces as
    ``bufurl='...'``. Both parts are downloaded and concatenated.

    Args:
        url: Chapter link relative to ``https://www.vodtw.com/book/516/``.

    Returns:
        A dict with the keys ``"title"``, ``"content"`` and ``"url"``
        (``"url"`` is the absolute chapter URL).
    """
    url = 'https://www.vodtw.com/book/516/' + url
    page = getHtml(url)
    tree = etree.HTML(page)

    # The title is read from the second <h1> text node. Fall back instead of
    # raising IndexError when the page has fewer headings, so one odd page
    # cannot abort the whole run.
    headings = tree.xpath("//h1/text()")
    if len(headings) > 1:
        title = headings[1]
    elif headings:
        title = headings[0]
    else:
        title = ''
    print(f"正在抓取 {url} {title}")

    # First part: the text nodes already present in the page.
    paragraphs = tree.xpath("//*[@id='content']/text()")
    content = '\n'.join(normalize("NFKD", text) for text in paragraphs)

    # Second part: requested separately from the path given in bufurl='...'.
    matches = re.findall(r"bufurl='(.*?)'", page.decode('utf-8'))
    if matches:
        rest = getHtml('https://www.vodtw.com' + matches[0]).decode('utf-8')
        content += rest.replace('<br><br>', '\n')
    else:
        # No second part announced: keep what the page had rather than crash.
        print(f"未找到 bufurl,仅保存页面内已有的部分: {url}")

    return {"title": title, "content": content, "url": url}
if __name__ == "__main__":
    # Script entry point: read the book's index page, download the chapters
    # through a process pool and append them to a text file in batches.

    def _flush(pending, path):
        """Wait for every pending result and append its chapter to *path*.

        Falsy results are skipped. *pending* is emptied afterwards.
        """
        chapters = [chapter for chapter in (job.get() for job in pending) if chapter]
        if chapters:
            with open(path, 'a', encoding='utf-8') as f:
                for chapter in chapters:
                    f.write(chapter["title"] + '\n')
                    f.write(chapter["content"] + '\n')
        pending.clear()

    start = time.time()
    url = 'https://www.vodtw.com/book/516/'
    html = getHtml(url)
    mulu = useXpath(html)
    filename = '寒门枭士.txt'
    cpus = cpu_count()
    pool = Pool(cpus)
    res_list = []
    # NOTE(review): the first 18 links are skipped -- presumably they are not
    # regular chapter entries; confirm against the index page.
    for i, chapter_url in enumerate(mulu[18:], start=1):
        res_list.append(pool.apply_async(func=getContent, args=(chapter_url,)))
        if i % 10 == 0:
            # Write to the file once every 10 chapters, then pause.
            _flush(res_list, filename)
            time.sleep(3)
    # Bug fix: results were only collected inside the every-10 branch, so the
    # final partial batch was downloaded but never written to the file.
    _flush(res_list, filename)
    # Close the pool.
    pool.close()
    # Wait for all worker processes to finish.
    pool.join()
    elapsed = time.time() - start
    minutes, seconds = divmod(elapsed, 60)
    print(f"抓取完毕,总共耗时:{Decimal(minutes).quantize(Decimal('0'))}分{Decimal(seconds).quantize(Decimal('0.00'))}秒")