小说爬虫

昨天有个老哥分享了8000本小说,可惜只能在线看。刚好最近在学 Python,就来练练手:基于 Python 3.9 和最简单的 requests 库做同步爬取(换成异步爬取的话,估计网站会崩)。

import pandas as pd

import requests

from lxml import etree

# Request headers passed to every requests.get() call in this script.
# NOTE(review): "authority", "method", "path" and "scheme" look like HTTP/2
# pseudo-headers copied from the browser dev tools (minus the leading colon),
# and "path" is pinned to one specific page. They are kept unchanged so the
# requests stay identical -- confirm whether anything besides "user-agent"
# is actually needed.
headers = {
    "authority": "www.sinodan.cc",
    "method": "GET",
    "path": "/txt/xiuxianzai.html",
    "scheme": "https",
    "user-agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_2) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/88.0.4324.182 Safari/537.36 Edg/88.0.705.81"
    ),
}

# Create the novel file
def creat_novel(novel_name):
    """Make sure ``<novel_name>.txt`` exists, leaving any content untouched.

    The file is opened in append mode, so a missing file is created empty
    and an existing file keeps whatever it already contains.

    Args:
        novel_name: Book title, used as the file name without the ``.txt``
            suffix. A relative name resolves against the current working
            directory.
    """
    # Nothing is written on purpose; ``with`` guarantees the handle is closed.
    with open('{}.txt'.format(novel_name), mode='a', encoding='utf-8'):
        pass

# Save a chapter
def se_chapter(novel_name, chapter_name, chapter):
    """Append one chapter to ``<novel_name>.txt``.

    The chapter title is written on its own line, followed by the chapter
    body and a trailing newline. The file is opened in append mode with
    UTF-8 encoding, so earlier chapters are preserved.

    Args:
        novel_name: Book title, used as the file name without ``.txt``.
        chapter_name: Chapter title (str).
        chapter: Chapter body text (str).
    """
    # ``with`` closes the file even if one of the writes raises.
    with open('{}.txt'.format(novel_name), mode='a', encoding='utf-8') as file:
        file.write(chapter_name + '\n')
        file.write(chapter + '\n')

# Re-flow flattened chapter text (helper for get_chapter)
def _format_chapter(text):
    """Restore paragraph breaks and strip filler from whitespace-collapsed text.

    A blank line is inserted after every full stop or closing corner bracket
    that is followed by a space; afterwards the remaining spaces and the
    ``***`` separators are removed.

    Args:
        text: Chapter text in which whitespace runs were collapsed to single
            spaces.

    Returns:
        The re-flowed text.
    """
    # These two must run before the spaces are stripped: the trailing space
    # is what marks a paragraph boundary.
    text = text.replace('。 ', '。\n\n').replace('」 ', '」\n\n')
    # NOTE(review): the original chain removed three whitespace-looking
    # characters, but the pasted source shows all of them as a plain space.
    # U+3000 (ideographic space) and U+00A0 (no-break space) are assumed here
    # -- confirm against the real pages.
    for junk in (' ', '***', '\u3000', '\xa0'):
        text = text.replace(junk, '')
    return text


# Fetch the content of one chapter
def get_chapter(novel_name, chapter_name, url):
    """Download one chapter page and append its text to the novel file.

    The page is decoded as GBK, the text is extracted with XPath, re-flowed
    by ``_format_chapter`` and stored through ``se_chapter``. The pasted
    version only printed the text and never used ``novel_name`` or
    ``chapter_name``, so nothing was ever written to the file.

    Args:
        novel_name: Book title; selects the ``<novel_name>.txt`` output file.
        chapter_name: Chapter title written above the chapter body.
        url: Absolute URL of the chapter page.
    """
    response = requests.get(url, headers=headers)
    # Force GBK decoding instead of relying on the encoding requests guesses.
    response.encoding = 'GBK'
    html = etree.HTML(response.text)
    # normalize-space() on a node-set uses the string value of the first
    # matching <p> only and collapses whitespace runs to single spaces.
    chapter = html.xpath('normalize-space(//div[@class="box_box"]/p)')
    se_chapter(novel_name, chapter_name, _format_chapter(chapter))

# Collect the reading URL of every chapter
def get_readURL(novel_name, url):
    """Fetch a book's chapter list and download every chapter in order.

    The chapter links and titles are read from the ``list_box`` div of the
    page at ``url``. Each relative link is appended to the site root, a
    progress line is printed, and ``get_chapter`` is called for it.

    Args:
        novel_name: Book title, forwarded to ``get_chapter``.
        url: Absolute URL of the book's chapter-list page.
    """
    baseUrl = 'https://www.sinodan.cc{}'
    response = requests.get(url, headers=headers)
    # NOTE(review): get_chapter forces GBK decoding, while this page relies on
    # the encoding requests detects -- confirm chapter titles decode correctly.
    html = etree.HTML(response.text)
    urls = html.xpath("//div[@class='list_box']/ul/li/a/@href")
    chapters = html.xpath("//div[@class='list_box']/ul/li/a/text()")
    # ``href`` instead of ``url`` so the loop no longer shadows the parameter.
    for chapter, href in zip(chapters, urls):
        chapter_url = baseUrl.format(href)
        print('##正在获取{}-{}:{}'.format(novel_name, chapter, chapter_url))
        get_chapter(novel_name, chapter, chapter_url)

def main():
    """Read the book catalogue from Excel and crawl every listed novel.

    The workbook must provide the columns ``书名`` (book title) and
    ``阅读链接`` (reading link). Books are processed in title order: an output
    txt file is created for each one, then its chapter list is crawled.
    The workbook path below is a placeholder and has to be edited before
    running.
    """
    # Read the Excel file; titles are forced to str.
    table = pd.read_excel('Excel目录地址/小说目录.xlsx', dtype={'书名': 'str'})
    # Sort by book title
    table.sort_values(by='书名', inplace=True)
    booksName = table['书名']
    booksURL = table['阅读链接']
    for name, url in zip(booksName, booksURL):
        print('#开始{}:{}'.format(name, url))
        # Create the novel's txt file
        creat_novel(name)
        # Derive the chapter-list address from the reading link by dropping
        # the '/txt' segment and the '.html' suffix, then crawl it.
        get_readURL(name, url.replace('/txt', '').removesuffix('.html'))

# Run the crawler only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()

复制代码后,需要修改代码中 main 方法里的 Excel 目录地址。现在主要存在文本排版问题,哪位大佬可以帮忙优化一下?

发表回复

您的邮箱地址不会被公开。 必填项已用 * 标注