小说爬虫
昨天有个老哥分享了8000本小说，可惜是在线看的。刚好最近在学Python，就来练练手：基于Python 3.9，用最简单的requests库同步爬取；异步爬取的话估计网站会崩。
import pandas as pd
import requests
from lxml import etree
# HTTP headers sent with every request to the novel site.
# The user-agent masquerades as a desktop Chrome/Edge browser so the
# server serves normal pages to the scraper.
headers = {
    "authority": "www.sinodan.cc",
    "method": "GET",
    "path": "/txt/xiuxianzai.html",
    "scheme": "https",
    "user-agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_2) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/88.0.4324.182 Safari/537.36 Edg/88.0.705.81"
    ),
}
# Create the novel's output file
def creat_novel(novel_name):
    """Ensure '<novel_name>.txt' exists in the working directory.

    Opens the file in append mode (so an existing file and its
    contents are preserved) and closes it immediately; chapters are
    appended later by se_chapter().
    """
    # 'with' guarantees the handle is closed even if open() raises.
    with open('{}.txt'.format(novel_name), mode='a', encoding='utf-8'):
        pass
# Append one chapter to the novel's file
def se_chapter(novel_name, chapter_name, chapter):
    """Append a chapter (title line, then body) to '<novel_name>.txt'.

    Both the chapter title and the chapter text are followed by a
    newline so consecutive chapters stay separated in the output file.
    """
    # 'with' replaces the manual open/close pair and is leak-safe.
    with open('{}.txt'.format(novel_name), mode='a', encoding='utf-8') as file:
        file.write(chapter_name + '\n')
        file.write(chapter + '\n')
# Fetch one chapter page, clean up its text, and save it
def get_chapter(novel_name, chapter_name, url):
    """Download a single chapter, restore its paragraph layout and
    append it to '<novel_name>.txt' via se_chapter().

    The site serves GBK-encoded pages; the chapter body lives in
    <div class="box_box">/<p>. normalize-space() collapses the page's
    whitespace, so sentence ('。') and quote ('」') endings are turned
    back into line breaks, and ad markers / leftover space characters
    are stripped.
    """
    response = requests.get(url, headers=headers)
    response.encoding = 'GBK'
    html = etree.HTML(response.text)
    chapter = html.xpath('normalize-space(//div[@class="box_box"]/p)')
    # NOTE(review): the original post's space characters were mangled
    # by the blog formatting — presumably ideographic (\u3000) and em
    # (\u2003) spaces from the page layout; confirm against live pages.
    chapter = (chapter.replace('。 ', '。\n')
               .replace('」 ', '」\n')
               .replace('\u3000', '')
               .replace('***', '')
               .replace('\u2003', '')
               .replace(' ', ''))
    # The original only print()ed the cleaned text and never saved it,
    # leaving novel_name/chapter_name unused — write it to the file.
    se_chapter(novel_name, chapter_name, chapter)
# Collect every chapter's reading URL from the novel's index page
def get_readURL(novel_name, url):
    """Fetch a novel's chapter index page and download each listed chapter.

    The index page's <div class="list_box"> contains one <a> element
    per chapter; its relative href is joined onto the site root before
    being handed to get_chapter().
    """
    baseUrl = 'https://www.sinodan.cc{}'
    response = requests.get(url, headers=headers)
    html = etree.HTML(response.text)
    urls = html.xpath("//div[@class='list_box']/ul/li/a/@href")
    chapters = html.xpath("//div[@class='list_box']/ul/li/a/text()")
    # Renamed loop variable so it no longer shadows the 'url' parameter.
    for chapter, chapter_href in zip(chapters, urls):
        full_url = baseUrl.format(chapter_href)
        print('##正在获取{}-{}:{}'.format(novel_name, chapter, full_url))
        get_chapter(novel_name, chapter, full_url)
def main():
    """Read the novel catalogue from an Excel file and scrape every
    novel it lists.

    The Excel file must contain a '书名' (title) column and a
    '阅读链接' (reading link) column. NOTE: adjust the hard-coded
    path below to wherever the catalogue file actually lives.
    """
    # Force the title column to str so sorting/formatting is uniform.
    table = pd.read_excel('Excel目录地址/小说目录.xlsx', dtype={'书名': 'str'})
    # Process novels in a stable, alphabetical order.
    table.sort_values(by='书名', inplace=True)
    booksName = table['书名']
    booksURL = table['阅读链接']
    for name, url in zip(booksName, booksURL):
        print('#开始{}:{}'.format(name, url))
        # Make sure the novel's txt file exists before appending.
        creat_novel(name)
        # Catalogue links point at '/txt/<name>.html'; the chapter
        # index lives at '/<name>' — strip the prefix and suffix.
        get_readURL(name, url.replace('/txt', '').removesuffix('.html'))


if __name__ == "__main__":
    main()
复制代码后需要修改main方法中的Excel目录地址。现在主要存在文本排版问题，哪位大佬可以帮忙优化一下？