From e681c5d35386a7d4f50aabe0114cb288cd6807bb Mon Sep 17 00:00:00 2001 From: taocong45644 Date: Thu, 30 Apr 2026 17:32:32 +0800 Subject: [PATCH] =?UTF-8?q?docs:=20=E6=B7=BB=E5=8A=A0=E5=B0=8F=E8=AF=B4?= =?UTF-8?q?=E9=98=85=E8=AF=BB=E5=99=A8=E5=8A=9F=E8=83=BD=E8=AF=B4=E6=98=8E?= =?UTF-8?q?=E6=96=87=E6=A1=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 添加详细的功能说明文档,包含项目概述、实现功能、技术实现、依赖安装和使用说明等内容 --- book_reader/功能说明.md | 157 ++++++ book_reader/小说阅读器.py | 986 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 1143 insertions(+) create mode 100644 book_reader/功能说明.md create mode 100644 book_reader/小说阅读器.py diff --git a/book_reader/功能说明.md b/book_reader/功能说明.md new file mode 100644 index 0000000..d5b237e --- /dev/null +++ b/book_reader/功能说明.md @@ -0,0 +1,157 @@ +# 小说阅读器功能说明 + +## 项目概述 + +本项目是一个基于Python Tkinter的小说阅读器应用,支持从指定网址爬取小说内容并生成PDF文件,同时提供PDF阅读功能。 + +## 实现功能 + +### 1. 小说爬取功能 + +- **网址输入**:支持输入小说章节列表页网址 +- **章节解析**:自动解析网页中的章节列表 +- **多线程爬取**:使用ThreadPoolExecutor并发爬取章节内容,提高速度 +- **内容清理**:自动过滤广告内容和无关文本 + +### 2. PDF生成功能 + +- **中文支持**:使用宋体字体确保中文正常显示 +- **书签生成**:自动为每个章节添加PDF书签 +- **页面布局**:合理的页边距和行距设置 +- **页码显示**:每页底部显示页码和书名 + +### 3. PDF阅读功能 + +- **内容显示**:使用PyMuPDF渲染PDF页面 +- **书签导航**:左侧显示PDF书签,点击可跳转到对应页面 +- **左右分栏布局**:书签和内容并排显示 +- **可调整大小**:支持拖动分隔线调整左右面板比例 +- **滚动浏览**:支持鼠标滚轮和滚动条浏览内容 + +### 4. 文本阅读模式 + +- **章节列表**:左侧显示所有章节标题 +- **内容显示**:右侧显示章节文本内容 +- **章节切换**:点击章节列表切换章节 + +### 5. 搜索小说功能 + +- **关键词搜索**:输入小说名称进行搜索 +- **结果展示**:显示搜索结果列表 +- **一键选择**:选择结果自动填充网址和名称 + +## 技术实现 + +### 核心技术栈 + +| 模块 | 技术 | 说明 | +|------|------|------| +| GUI框架 | Tkinter | Python内置图形界面库 | +| 网络请求 | requests | HTTP请求库 | +| HTML解析 | BeautifulSoup | HTML解析库 | +| PDF生成 | ReportLab | PDF生成库 | +| PDF处理 | PyMuPDF (fitz) | PDF读取和书签处理 | +| 多线程 | concurrent.futures | 并发爬取支持 | + +### 关键特性 + +1. **多线程爬取**:最多10个线程同时爬取,大幅提升速度 +2. **连接池优化**:使用HTTPAdapter设置连接池,减少连接开销 +3. **状态管理**:支持爬取过程中的状态监控和停止操作 +4. 
**异常处理**:完善的异常捕获和错误提示 + +## 依赖与安装 + +### 依赖库列表 + +| 库名称 | 版本 | 用途 | +|--------|------|------| +| requests | >=2.31.0 | HTTP网络请求 | +| beautifulsoup4 | >=4.12.0 | HTML解析 | +| reportlab | >=4.0.0 | PDF生成 | +| pymupdf | >=1.23.0 | PDF读取和渲染 | +| pypdf | >=4.0.0 | PDF书签处理 | +| lxml | >=5.1.0 | HTML解析支持 | + +### 安装方法 + +#### 方法一:使用pip逐一安装 + +```bash +pip install requests +pip install beautifulsoup4 +pip install reportlab +pip install pymupdf +pip install pypdf +pip install lxml +``` + +#### 方法二:使用requirements.txt + +创建 `requirements.txt` 文件,内容如下: + +```txt +requests>=2.31.0 +beautifulsoup4>=4.12.0 +reportlab>=4.0.0 +pymupdf>=1.23.0 +pypdf>=4.0.0 +lxml>=5.1.0 +``` + +然后执行安装: + +```bash +pip install -r requirements.txt +``` + +### 验证安装 + +安装完成后,可以通过以下命令验证: + +```bash +python -c "import requests; import bs4; import reportlab; import fitz; import pypdf; import lxml; print('所有依赖安装成功')" +``` + +### 注意事项 + +1. **ReportLab字体**:如果生成PDF时中文显示为方框,需要确保系统中有宋体字体(程序会在当前目录及 Windows 字体目录下查找 `simsun.ttc` / `SimSun.ttf`) +2. **PyMuPDF版本**:建议使用最新版本以获得更好的兼容性 +3. **网络环境**:爬取功能需要网络连接,部分网站可能需要代理 + +## 使用说明 + +### 爬取小说 + +1. 在"网址"输入框中输入小说章节列表页网址 +2. 在"小说名称"输入框中输入小说名称 +3. 点击"开始爬取"按钮 +4. 等待爬取完成,自动生成PDF文件 + +### 阅读PDF + +1. 点击"打开PDF"按钮选择PDF文件 +2. 左侧显示书签列表(如有) +3. 点击书签跳转到对应页面 +4. 拖动中间分隔线调整左右面板大小 + +### 切换阅读模式 + +- **文本阅读**:查看爬取的文本内容 +- **PDF阅读**:查看生成的PDF文件 + +## 输出文件 + +爬取完成后,PDF文件保存在程序运行时当前工作目录下的 `download` 文件夹中(文件夹不存在时会自动创建),文件名为 `小说名称.pdf`,其中小说名称即输入框中填写的名称。 + +## 注意事项 + +1. 爬取速度受网络状况和目标网站限制 +2. 建议使用合法合规的小说网站进行爬取 +3. PDF生成需要安装ReportLab库 +4. 
PDF书签功能需要安装PyPDF库 + +--- + +*项目版本:1.0* +*最后更新:2026年4月* diff --git a/book_reader/小说阅读器.py b/book_reader/小说阅读器.py new file mode 100644 index 0000000..c609745 --- /dev/null +++ b/book_reader/小说阅读器.py @@ -0,0 +1,986 @@ +import tkinter as tk +from tkinter import ttk, messagebox, filedialog +import threading +import os +import sys +import requests +from bs4 import BeautifulSoup +import fitz + +class NovelSpider: + def __init__(self): + self.session = requests.Session() + adapter = requests.adapters.HTTPAdapter( + pool_connections=20, + pool_maxsize=20, + max_retries=2 + ) + self.session.mount('http://', adapter) + self.session.mount('https://', adapter) + self.session.verify = False + self.session.headers.update({ + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8', + 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', + 'Referer': 'https://www.92yanqing.com/', + 'Connection': 'keep-alive' + }) + self.timeout = 8 + + def get_chapters(self, book_url): + try: + response = self.session.get(book_url, timeout=self.timeout) + response.encoding = "utf-8" + soup = BeautifulSoup(response.text, 'html.parser') + + chapters = [] + + chapter_list = soup.find('div', class_='chapterlist') + if not chapter_list: + chapter_list = soup.find('div', class_='listmain') + if not chapter_list: + chapter_list = soup.find('div', id='list') + if not chapter_list: + chapter_list = soup.find('ul', class_='chapterlist') + if not chapter_list: + chapter_list = soup.find('div', class_='chapter') + + if chapter_list: + links = chapter_list.find_all('a', href=True) + for link in links: + href = link.get('href') + title = link.get_text(strip=True) + if href and title and '/read/' in href: + if not href.startswith('http'): + if href.startswith('/'): + href = "https://www.92yanqing.com" + href + else: + href = 
book_url.rstrip('/') + '/' + href + chapters.append((title, href, book_url)) + + if len(chapters) < 10: + start_read_link = soup.find('a', text='开始阅读') + if start_read_link: + start_url = start_read_link.get('href') + if not start_url.startswith('http'): + if start_url.startswith('/'): + start_url = "https://www.92yanqing.com" + start_url + else: + start_url = book_url.rstrip('/') + '/' + start_url + + try: + response = self.session.get(start_url) + response.encoding = "utf-8" + soup = BeautifulSoup(response.text, 'html.parser') + + chapter_select = soup.find('select') + if chapter_select: + options = chapter_select.find_all('option') + for option in options: + value = option.get('value') + title = option.get_text(strip=True) + if value and title and value != '#': + if not value.startswith('http'): + if value.startswith('/'): + value = "https://www.92yanqing.com" + value + else: + value = book_url.rstrip('/') + '/' + value + chapters.append((title, value, book_url)) + except Exception as e: + print(f"尝试从开始阅读页面获取章节失败: {e}") + + chapters.sort(key=lambda x: x[0]) + + print(f"获取到 {len(chapters)} 章") + return chapters + except Exception as e: + print(f"获取章节失败: {e}") + return [] + + def get_content(self, chapter_info): + title, url, book_url = chapter_info + content = "" + has_chapter_end = False + + try: + while url: + response = self.session.get(url, timeout=self.timeout) + response.encoding = "utf-8" + soup = BeautifulSoup(response.text, 'html.parser') + + content_div = soup.find('div', id='content') or soup.find('div', class_='content') + + if content_div: + for script in content_div(['script', 'style']): + script.decompose() + + text = content_div.get_text(separator='\n', strip=True) + has_chapter_end = '本章完' in text + text = text.replace('本章未完,点击下一页继续阅读', '') + text = text.replace('本章完', '') + text = text.replace('请记住本书首发域名:www.92yanqing.com。', '') + text = text.replace('92言情小说网', '') + text = text.replace('最快更新无弹窗小说', '') + + lines = text.split('\n') + 
cleaned_lines = [] + for line in lines: + line = line.strip() + if line and line != title: + cleaned_lines.append(line) + + content += '\n'.join(cleaned_lines) + '\n\n' + + next_page = None + next_link = soup.find('a', text='下一页') or soup.find('a', text='下一章') + if next_link: + next_page = next_link.get('href') + if not next_page.startswith('http'): + if next_page.startswith('/'): + next_page = "https://www.92yanqing.com" + next_page + else: + next_page = book_url.rstrip('/') + '/' + next_page + + if has_chapter_end or not next_page: + break + + url = next_page + + return content.strip() + except Exception as e: + print(f"获取内容失败: {e}") + return content + +class PDFViewer(tk.Frame): + def __init__(self, parent): + super().__init__(parent) + self.parent = parent + self.pdf_document = None + self.current_page = 0 + self.total_pages = 0 + self.page_images = {} + self.outline = [] + + self.paned_window = ttk.PanedWindow(self, orient=tk.HORIZONTAL) + self.paned_window.pack(fill=tk.BOTH, expand=True) + + self.outline_frame = ttk.Frame(self, width=200, borderwidth=1, relief="solid") + self.outline_frame.pack_propagate(False) + + self.outline_tree = ttk.Treeview(self.outline_frame, show='tree') + self.outline_tree.pack(fill=tk.BOTH, expand=True) + self.outline_tree.bind('<>', self.on_outline_select) + self.outline_tree.insert("", tk.END, text="请打开PDF文件") + + self.content_frame = ttk.Frame(self) + self.content_frame.pack_propagate(False) + + self.canvas = tk.Canvas(self.content_frame, bg='white') + self.canvas.pack(fill=tk.BOTH, expand=True) + + self.scrollbar_y = ttk.Scrollbar(self.content_frame, orient=tk.VERTICAL, command=self.canvas.yview) + self.scrollbar_y.pack(side=tk.RIGHT, fill=tk.Y) + self.canvas.config(yscrollcommand=self.scrollbar_y.set) + + self.scrollbar_x = ttk.Scrollbar(self.content_frame, orient=tk.HORIZONTAL, command=self.canvas.xview) + self.scrollbar_x.pack(side=tk.BOTTOM, fill=tk.X) + self.canvas.config(xscrollcommand=self.scrollbar_x.set) + + 
self.canvas.bind('', self.on_mousewheel) + self.canvas.bind('', self.on_mousewheel) + self.canvas.bind('', self.on_mousewheel) + + self.status_label = ttk.Label(self, text="") + self.status_label.pack(side=tk.BOTTOM, fill=tk.X) + + def load_pdf(self, file_path): + try: + print(f"开始加载PDF: {file_path}") + self.pdf_document = fitz.open(file_path) + self.total_pages = len(self.pdf_document) + self.current_page = 0 + self.page_images = {} + print(f"PDF加载成功,共 {self.total_pages} 页") + + self.load_outline() + + self.show_page(0) + self.update_status() + return True + except Exception as e: + print(f"加载PDF失败: {e}") + messagebox.showerror("错误", f"加载PDF失败: {str(e)}") + return False + + def load_outline(self): + for item in self.outline_tree.get_children(): + self.outline_tree.delete(item) + + self.outline = [] + try: + toc = self.pdf_document.get_toc() + print(f"获取到书签数量: {len(toc) if toc else 0}") + + if toc: + parent_map = {0: ""} + count = 0 + for i, entry in enumerate(toc[:5]): + print(f"书签 {i}: {entry}") + + for entry in toc: + level, title, page_num = entry + if level not in parent_map: + parent_map[level] = parent_map.get(level - 1, "") + + parent = parent_map.get(level - 1, "") + try: + item_id = self.outline_tree.insert(parent, tk.END, text=title, values=(page_num - 1,)) + parent_map[level] = item_id + self.outline.append((title, page_num - 1)) + count += 1 + except Exception as insert_e: + print(f"插入书签失败 '{title}': {insert_e}") + + print(f"已加载 {count} 个书签") + print(f"Treeview子节点数量: {len(self.outline_tree.get_children())}") + else: + self.outline_tree.insert("", tk.END, text="该PDF没有书签") + print("PDF没有书签") + + except Exception as e: + self.outline_tree.insert("", tk.END, text="加载书签失败") + print(f"加载书签失败: {e}") + + panes = list(self.paned_window.panes()) + for pane in panes: + self.paned_window.forget(pane) + + self.paned_window.add(self.outline_frame, weight=1) + self.paned_window.add(self.content_frame, weight=3) + + def on_outline_select(self, event): + selected = 
self.outline_tree.selection() + if selected: + item = selected[0] + page_num = int(self.outline_tree.item(item, "values")[0]) + self.show_page(page_num) + + def show_page(self, page_num): + if not self.pdf_document or page_num < 0 or page_num >= self.total_pages: + return + + self.current_page = page_num + + if page_num in self.page_images: + img = self.page_images[page_num] + else: + page = self.pdf_document.load_page(page_num) + pix = page.get_pixmap(matrix=fitz.Matrix(2, 2)) + img = tk.PhotoImage(data=pix.tobytes('ppm')) + self.page_images[page_num] = img + + self.canvas.image = img + self.canvas.delete('all') + self.canvas.create_image(0, 0, anchor=tk.NW, image=img) + self.canvas.config(scrollregion=self.canvas.bbox(tk.ALL)) + self.canvas.yview_moveto(0) + self.canvas.xview_moveto(0) + self.update_status() + + def update_status(self): + if self.pdf_document: + self.status_label.config(text=f"第 {self.current_page + 1} / {self.total_pages} 页") + else: + self.status_label.config(text="") + + def on_mousewheel(self, event): + if event.num == 4 or event.delta > 0: + self.canvas.yview_scroll(-1, 'units') + else: + self.canvas.yview_scroll(1, 'units') + + def next_page(self): + if self.current_page < self.total_pages - 1: + self.show_page(self.current_page + 1) + + def prev_page(self): + if self.current_page > 0: + self.show_page(self.current_page - 1) + +class NovelReaderApp: + def __init__(self, root): + self.root = root + self.root.title("小说阅读器") + self.root.geometry("1200x800") + + self.spider = NovelSpider() + self.chapters = [] + self.current_chapter = 0 + self.content_cache = {} + self.novel_name = "" + + self.create_widgets() + + def create_widgets(self): + self.top_frame = ttk.Frame(self.root, padding="10") + self.top_frame.pack(fill=tk.X, side=tk.TOP) + + ttk.Label(self.top_frame, text="网址:").pack(side=tk.LEFT, padx=5) + self.url_entry = ttk.Entry(self.top_frame, width=50) + self.url_entry.pack(side=tk.LEFT, padx=5) + + ttk.Label(self.top_frame, 
text="小说名称:").pack(side=tk.LEFT, padx=5) + self.name_entry = ttk.Entry(self.top_frame, width=30) + self.name_entry.pack(side=tk.LEFT, padx=5) + + self.crawl_btn = ttk.Button(self.top_frame, text="开始爬取", command=self.start_crawl) + self.crawl_btn.pack(side=tk.LEFT, padx=5) + + self.search_btn = ttk.Button(self.top_frame, text="搜索小说", command=self.search_novel) + self.search_btn.pack(side=tk.LEFT, padx=5) + + self.open_pdf_btn = ttk.Button(self.top_frame, text="打开PDF", command=self.open_pdf_file) + self.open_pdf_btn.pack(side=tk.LEFT, padx=5) + + self.mode_var = tk.StringVar(value="text") + ttk.Radiobutton(self.top_frame, text="文本阅读", variable=self.mode_var, value="text", + command=self.switch_mode).pack(side=tk.LEFT, padx=5) + ttk.Radiobutton(self.top_frame, text="PDF阅读", variable=self.mode_var, value="pdf", + command=self.switch_mode).pack(side=tk.LEFT, padx=5) + + self.progress_bar = ttk.Progressbar(self.top_frame, mode='determinate', maximum=100, value=0) + + self.main_frame = ttk.Frame(self.root) + self.main_frame.pack(fill=tk.BOTH, expand=True) + + self.left_frame = ttk.Frame(self.main_frame, width=250, borderwidth=1, relief="solid") + self.left_frame.pack(fill=tk.Y, side=tk.LEFT) + self.left_frame.pack_propagate(False) + + self.chapter_tree = ttk.Treeview(self.left_frame, columns=('index',), show='tree') + self.chapter_tree.pack(fill=tk.BOTH, expand=True) + self.chapter_tree.bind('<>', self.on_chapter_select) + + self.chapter_tree.insert("", tk.END, text="请输入小说网址并爬取") + print("章节树已初始化,显示提示文本") + + self.text_frame = ttk.Frame(self.main_frame) + self.text_frame.pack(fill=tk.BOTH, expand=True) + + self.content_text = tk.Text(self.text_frame, wrap=tk.WORD, font=('SimSun', 12)) + self.content_text.pack(fill=tk.BOTH, expand=True) + + self.pdf_frame = PDFViewer(self.main_frame) + + self.bottom_frame = ttk.Frame(self.root, padding="10") + self.bottom_frame.pack(fill=tk.X, side=tk.BOTTOM) + + + + self.status_label = ttk.Label(self.bottom_frame, text="准备就绪") + 
self.status_label.pack(side=tk.RIGHT) + + self.current_pdf_path = "" + self.stop_crawl_flag = False + + def sort_chapters(self, chapters): + import re + + def extract_chapter_number(title): + match = re.search(r'第(\d+)章', title) + if match: + return int(match.group(1)) + match = re.search(r'(\d+)、', title) + if match: + return int(match.group(1)) + match = re.search(r'(\d+) ', title) + if match: + return int(match.group(1)) + return 0 + + sorted_chapters = sorted(chapters, key=lambda x: extract_chapter_number(x[0])) + return sorted_chapters + + def disable_inputs(self): + self.url_entry.config(state=tk.DISABLED) + self.name_entry.config(state=tk.DISABLED) + self.search_btn.config(state=tk.DISABLED) + self.open_pdf_btn.config(state=tk.DISABLED) + + def enable_inputs(self): + self.url_entry.config(state=tk.NORMAL) + self.name_entry.config(state=tk.NORMAL) + self.search_btn.config(state=tk.NORMAL) + self.open_pdf_btn.config(state=tk.NORMAL) + + def start_crawl(self): + if self.crawl_btn['text'] == '停止爬取': + self.stop_crawl_flag = True + self.crawl_btn.config(text='开始爬取') + self.status_label.config(text="爬取已停止") + self.enable_inputs() + return + + book_url = self.url_entry.get().strip() + self.novel_name = self.name_entry.get().strip() + + if not book_url: + messagebox.showwarning("警告", "请输入网址") + return + + if not self.novel_name: + messagebox.showwarning("警告", "请输入小说名称") + return + + self.stop_crawl_flag = False + self.content_cache = {} + self.chapters = [] + self.disable_inputs() + self.crawl_btn.config(text='停止爬取') + self.progress_bar.pack(side=tk.LEFT, padx=5, fill=tk.X, expand=True) + + self.start_crawl_thread() + + def search_novel(self): + novel_name = self.name_entry.get().strip() + + if not novel_name: + messagebox.showwarning("警告", "请输入小说名称") + return + + self.status_label.config(text=f"正在搜索《{novel_name}》...") + + def search_thread(): + try: + search_url = f"https://www.92yanqing.com/s/?searchkey={novel_name}" + response = 
self.spider.session.get(search_url, timeout=10) + response.encoding = "utf-8" + soup = BeautifulSoup(response.text, 'html.parser') + + results = [] + book_items = soup.find_all('div', class_='bookitem') or soup.find_all('div', class_='search-item') + + if not book_items: + book_items = soup.find_all('a', href=True) + + for item in book_items: + title = item.get_text(strip=True) + href = item.get('href', '') + if title and href and '/read/' in href: + if not href.startswith('http'): + href = "https://www.92yanqing.com" + href + if novel_name in title: + results.append((title, href)) + + if not results: + self.root.after(0, lambda: messagebox.showwarning("提示", f"未找到《{novel_name}》相关小说")) + return + + self.root.after(0, lambda: self.show_search_results(results)) + + except Exception as e: + print(f"搜索失败: {e}") + self.root.after(0, lambda: messagebox.showerror("错误", f"搜索失败: {str(e)}")) + + threading.Thread(target=search_thread, daemon=True).start() + + def show_search_results(self, results): + search_window = tk.Toplevel(self.root) + search_window.title("搜索结果") + search_window.geometry("800x500") + + tree = ttk.Treeview(search_window, columns=('url',), show='tree') + tree.pack(fill=tk.BOTH, expand=True) + + for title, url in results: + tree.insert("", tk.END, text=title, values=(url,)) + + def on_select(event): + selected = tree.selection() + if selected: + item = selected[0] + url = tree.item(item, "values")[0] + title = tree.item(item, "text") + self.url_entry.delete(0, tk.END) + self.url_entry.insert(0, url) + self.name_entry.delete(0, tk.END) + self.name_entry.insert(0, title) + search_window.destroy() + + tree.bind('<>', on_select) + + select_btn = ttk.Button(search_window, text="选择", command=lambda: on_select(None)) + select_btn.pack(pady=10) + + self.status_label.config(text="搜索完成") + + def start_crawl_thread(self): + book_url = self.url_entry.get().strip() + self.novel_name = self.name_entry.get().strip() + + def crawl_thread(): + try: + self.root.after(0, 
lambda: self.progress_bar.config(value=0)) + + self.root.after(0, lambda: self.status_label.config(text="正在获取章节列表...")) + print("开始获取章节列表") + self.chapters = self.spider.get_chapters(book_url) + print(f"获取章节完成,共 {len(self.chapters)} 章") + + if self.stop_crawl_flag: + return + + if not self.chapters: + self.root.after(0, lambda: messagebox.showwarning("警告", "未能获取章节列表")) + return + + self.root.after(0, lambda: self.progress_bar.config(value=5)) + self.root.after(0, lambda: self.status_label.config(text="正在排序章节...")) + self.chapters = self.sort_chapters(self.chapters) + print(f"章节排序完成,共 {len(self.chapters)} 章") + + self.root.after(0, lambda: self.progress_bar.config(value=8)) + print("准备更新章节树") + self.root.after(0, self.update_chapter_tree) + + self.root.after(0, lambda: self.status_label.config(text="正在生成PDF...")) + self.root.after(0, self.auto_generate_pdf) + + except Exception as e: + self.root.after(0, lambda: messagebox.showerror("错误", f"爬取失败: {str(e)}")) + finally: + self.root.after(0, self.crawl_complete) + + threading.Thread(target=crawl_thread, daemon=True).start() + + def update_chapter_tree(self): + print(f"update_chapter_tree 被调用,章节数: {len(self.chapters)}") + for item in self.chapter_tree.get_children(): + self.chapter_tree.delete(item) + + for i, (title, _, _) in enumerate(self.chapters): + self.chapter_tree.insert("", tk.END, text=title, values=(i,)) + print(f"章节树已更新,共 {len(self.chapters)} 章") + + self.crawl_btn.config(state=tk.NORMAL) + self.status_label.config(text=f"《{self.novel_name}》获取成功,共 {len(self.chapters)} 章") + + if self.chapters: + self.chapter_tree.selection_set(self.chapter_tree.get_children()[0]) + self.show_chapter(0) + + def auto_generate_pdf(self): + try: + from reportlab.pdfbase import pdfmetrics + from reportlab.pdfbase.ttfonts import TTFont + from reportlab.platypus import SimpleDocTemplate, Paragraph, PageBreak + from reportlab.lib.styles import ParagraphStyle + from reportlab.lib import colors + from reportlab.lib.pagesizes import 
A4 + from reportlab.lib.units import inch + except ImportError as e: + msg = f"需要安装reportlab才能生成PDF\n错误: {e}\n请运行: pip install reportlab" + self.status_label.config(text="需要安装reportlab") + messagebox.showwarning("提示", msg) + return + + self.status_label.config(text=f"正在多线程获取内容...") + + def pdf_thread(): + try: + from concurrent.futures import ThreadPoolExecutor, as_completed + import urllib3 + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + + total = len(self.chapters) + max_workers = min(10, total) + + def fetch_content(args): + idx, chapter = args + try: + if self.stop_crawl_flag: + return idx, None + content = self.spider.get_content(chapter) + return idx, content + except Exception as e: + print(f"章节 {idx} 获取失败: {e}") + return idx, None + + self.root.after(0, lambda: self.status_label.config(text=f"多线程获取内容... (0/{total})")) + + print(f"开始多线程获取内容,共 {total} 章,使用 {max_workers} 个线程") + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = {} + for i, chapter in enumerate(self.chapters): + if self.stop_crawl_flag: + break + future = executor.submit(fetch_content, (i, chapter)) + futures[future] = i + print(f"已提交任务 {i+1}/{total}") + + completed = 0 + for future in as_completed(futures): + if self.stop_crawl_flag: + executor.shutdown(wait=False) + return + + idx, content = future.result() + if content: + self.content_cache[idx] = content + + completed += 1 + progress = 10 + int(completed / total * 75) + self.root.after(0, lambda p=progress: self.progress_bar.config(value=p)) + self.root.after(0, lambda p=progress, c=completed, t=total: self.status_label.config( + text=f"多线程获取内容... 
{p}% ({c}/{t})")) + + download_dir = os.path.join(os.getcwd(), "download") + if not os.path.exists(download_dir): + os.makedirs(download_dir) + + font_path = None + possible_font_paths = [ + 'SimSun.ttf', + 'simsun.ttc', + os.path.join('C:', 'Windows', 'Fonts', 'simsun.ttc'), + os.path.join('C:', 'Windows', 'Fonts', 'SimSun.ttf'), + os.path.join(os.environ.get('WINDIR', 'C:\\Windows'), 'Fonts', 'simsun.ttc'), + os.path.join(os.environ.get('WINDIR', 'C:\\Windows'), 'Fonts', 'SimSun.ttf') + ] + + for path in possible_font_paths: + if os.path.exists(path): + font_path = path + break + + if font_path: + pdfmetrics.registerFont(TTFont('SimSun', font_path)) + else: + messagebox.showwarning("警告", "未找到中文字体,可能影响PDF生成") + + self.current_pdf_path = os.path.join(download_dir, f"{self.novel_name}.pdf") + + from reportlab.pdfgen import canvas + + doc = SimpleDocTemplate(self.current_pdf_path, pagesize=A4, leftMargin=50, rightMargin=50, + topMargin=50, bottomMargin=50) + + title_style = ParagraphStyle("BookTitle", fontSize=20, alignment=1, + spaceAfter=30, fontName="SimSun") + chap_style = ParagraphStyle("ChapTitle", fontSize=14, spaceBefore=20, + spaceAfter=10, fontName="SimSun", textColor=colors.darkblue) + txt_style = ParagraphStyle("Content", fontSize=11, leading=18, + spaceAfter=6, fontName="SimSun") + + story = [] + story.append(Paragraph(self.novel_name, title_style)) + story.append(PageBreak()) + + self.root.after(0, lambda: self.status_label.config(text="正在生成PDF...")) + + for i, chapter_info in enumerate(self.chapters): + if self.stop_crawl_flag: + return + + title, _, _ = chapter_info + + if i in self.content_cache: + content = self.content_cache[i] + else: + content = self.spider.get_content(chapter_info) + self.content_cache[i] = content + + chapter_para = Paragraph(title, chap_style) + chapter_para._bookmarkName = title + chapter_para._bookmarkLevel = 0 + story.append(chapter_para) + + if content: + lines = content.split("\n") + for line in lines: + line = 
line.strip() + if line and line != title and title not in line: + story.append(Paragraph(line, txt_style)) + story.append(PageBreak()) + + progress = 85 + int((i+1)/len(self.chapters) * 10) + self.root.after(0, lambda p=progress: self.progress_bar.config(value=p)) + self.root.after(0, lambda p=progress: self.status_label.config(text=f"正在生成PDF... {p}%")) + + def onFirstPage(canvas, doc): + canvas.saveState() + canvas.setFont('SimSun', 9) + canvas.drawString(inch, 0.75 * inch, f"《{self.novel_name}》") + canvas.restoreState() + + def onLaterPages(canvas, doc): + canvas.saveState() + canvas.setFont('SimSun', 9) + canvas.drawString(inch, 0.75 * inch, f"第 {doc.page} 页") + canvas.drawRightString(doc.pagesize[0] - inch, 0.75 * inch, f"《{self.novel_name}》") + canvas.restoreState() + + print("开始构建PDF文档...") + + def add_bookmarks(canvas, doc): + for element in story: + if hasattr(element, '_bookmarkName'): + canvas.bookmarkPage(element._bookmarkName) + canvas.addOutlineEntry(element._bookmarkName, element._bookmarkName, + element._bookmarkLevel, 0) + + doc.build(story, onFirstPage=onFirstPage, onLaterPages=onLaterPages) + print("PDF文档构建完成") + + self.root.after(0, lambda: self.progress_bar.config(value=97)) + self.root.after(0, lambda: self.status_label.config(text="正在添加书签...")) + + print("开始添加书签...") + self.add_pdf_bookmarks_simple(self.current_pdf_path, self.chapters) + print("书签添加完成") + + self.root.after(0, lambda: self.progress_bar.config(value=100)) + self.root.after(0, lambda: self.status_label.config(text=f"PDF生成完成!文件已保存")) + self.root.after(0, lambda: messagebox.showinfo("成功", f"PDF生成完成!\n文件位置: {self.current_pdf_path}")) + + except Exception as e: + print(f"PDF生成失败: {e}") + self.root.after(0, lambda: messagebox.showerror("错误", f"PDF生成失败: {str(e)}")) + + threading.Thread(target=pdf_thread, daemon=True).start() + + def add_pdf_bookmarks(self, pdf_path, toc_entries): + try: + doc = fitz.open(pdf_path) + print(f"PDF共 {doc.page_count} 页,需要添加 {len(toc_entries)} 个书签") + + 
page_titles = {} + for page_num in range(doc.page_count): + page = doc.load_page(page_num) + text = page.get_text() + if page_num < 3: + print(f"第 {page_num+1} 页前500字符: {text[:500]}") + for title, estimated_page in toc_entries: + if title in text and title not in page_titles: + page_titles[title] = page_num + break + + print(f"找到 {len(page_titles)} 个章节标题") + + added_count = 0 + for title, page_num in toc_entries: + if title in page_titles: + actual_page = page_titles[title] + else: + actual_page = max(0, page_num - 1) + + if actual_page < doc.page_count: + doc.insert_toc_item(title, actual_page + 1) + added_count += 1 + + print(f"已添加 {added_count} 个书签") + + temp_path = pdf_path + ".tmp" + doc.save(temp_path) + doc.close() + + import shutil + shutil.move(temp_path, pdf_path) + + print(f"书签添加成功") + except Exception as e: + print(f"添加书签失败: {e}") + + def add_pdf_bookmarks_simple(self, pdf_path, chapters): + try: + from pypdf import PdfReader, PdfWriter + + reader = PdfReader(pdf_path) + writer = PdfWriter() + + for page in reader.pages: + writer.add_page(page) + + total_pages = len(reader.pages) + print(f"PDF共有 {total_pages} 页,开始添加书签...") + + last_found_page = 1 + + for idx, (title, _, _) in enumerate(chapters): + found = False + for page_idx in range(last_found_page, min(last_found_page + 5, total_pages)): + text = reader.pages[page_idx].extract_text() + if text and title in text: + try: + writer.add_outline_item(title, page_idx, parent=None) + print(f"为第{idx+1}章 '{title}' 添加书签到第{page_idx+1}页") + last_found_page = page_idx + 1 + found = True + except Exception as e: + pass + break + + if not found: + for page_idx in range(last_found_page, total_pages): + text = reader.pages[page_idx].extract_text() + if text and title in text: + try: + writer.add_outline_item(title, page_idx, parent=None) + print(f"为第{idx+1}章 '{title}' 添加书签到第{page_idx+1}页(跨页查找)") + last_found_page = page_idx + 1 + except Exception as e: + pass + break + + output_path = pdf_path + "_with_bookmarks.pdf" 
+ with open(output_path, "wb") as f: + writer.write(f) + + import shutil + shutil.move(output_path, pdf_path) + + print("书签添加完成!") + except Exception as e: + print(f"添加书签失败: {e}") + + def find_chapter_page(self, doc, chapter_title, estimated_page): + search_start = max(0, estimated_page - 2) + search_end = min(doc.page_count, estimated_page + 2) + + for page_num in range(search_start, search_end): + page = doc.load_page(page_num) + text = page.get_text() + if chapter_title in text: + return page_num + + try: + for page_num in range(doc.page_count): + page = doc.load_page(page_num) + text = page.get_text() + if chapter_title in text: + return page_num + except Exception as e: + print(f"搜索章节页面失败: {e}") + + return estimated_page - 1 if estimated_page > 0 else 0 + + def crawl_complete(self): + self.progress_bar.stop() + self.progress_bar.pack_forget() + if self.crawl_btn['text'] == '停止爬取': + self.crawl_btn.config(text='开始爬取') + self.enable_inputs() + + def on_chapter_select(self, event): + selected = self.chapter_tree.selection() + if not selected: + return + + item = selected[0] + index = int(self.chapter_tree.item(item, "values")[0]) + self.current_chapter = index + + if self.mode_var.get() == "text": + self.show_chapter(index) + else: + self.show_pdf_chapter(index) + + def show_chapter(self, index): + if index < 0 or index >= len(self.chapters): + return + + self.current_chapter = index + title, _, _ = self.chapters[index] + + if index in self.content_cache: + content = self.content_cache[index] + self.status_label.config(text=f"第 {index+1}/{len(self.chapters)} 章:{title}") + self.display_content(title, content) + return + + self.status_label.config(text=f"正在加载第 {index+1} 章:{title}") + + def load_content(): + content = self.spider.get_content(self.chapters[index]) + self.content_cache[index] = content + self.root.after(0, lambda: self.display_content(title, content)) + self.root.after(0, lambda: self.status_label.config(text=f"第 {index+1}/{len(self.chapters)} 
章:{title}")) + + threading.Thread(target=load_content, daemon=True).start() + + def show_pdf_chapter(self, index): + if index < 0 or index >= len(self.chapters): + return + + self.current_chapter = index + title, _, _ = self.chapters[index] + + if self.pdf_frame.pdf_document: + doc = self.pdf_frame.pdf_document + for page_num in range(doc.page_count): + page = doc.load_page(page_num) + text = page.get_text() + if title in text: + self.pdf_frame.show_page(page_num) + self.status_label.config(text=f"第 {index+1}/{len(self.chapters)} 章:{title}") + return + + self.status_label.config(text=f"未找到章节 '{title}'") + + def display_content(self, title, content): + self.content_text.delete(1.0, tk.END) + self.content_text.insert(tk.END, f"{title}\n\n") + self.content_text.insert(tk.END, content) + self.content_text.config(state=tk.DISABLED) + + def prev_chapter(self): + if self.mode_var.get() == "text": + if self.current_chapter > 0: + self.current_chapter -= 1 + self.chapter_tree.selection_set(self.chapter_tree.get_children()[self.current_chapter]) + self.show_chapter(self.current_chapter) + else: + if self.current_chapter > 0: + self.current_chapter -= 1 + self.chapter_tree.selection_set(self.chapter_tree.get_children()[self.current_chapter]) + self.show_pdf_chapter(self.current_chapter) + + def next_chapter(self): + if self.mode_var.get() == "text": + if self.current_chapter < len(self.chapters) - 1: + self.current_chapter += 1 + self.chapter_tree.selection_set(self.chapter_tree.get_children()[self.current_chapter]) + self.show_chapter(self.current_chapter) + else: + if self.current_chapter < len(self.chapters) - 1: + self.current_chapter += 1 + self.chapter_tree.selection_set(self.chapter_tree.get_children()[self.current_chapter]) + self.show_pdf_chapter(self.current_chapter) + + def switch_mode(self): + mode = self.mode_var.get() + if mode == "text": + self.pdf_frame.pack_forget() + self.text_frame.pack(fill=tk.BOTH, expand=True) + self.left_frame.pack(fill=tk.Y, 
side=tk.LEFT) + if self.chapters: + self.show_chapter(self.current_chapter) + else: + self.text_frame.pack_forget() + self.left_frame.pack_forget() + self.pdf_frame.pack(fill=tk.BOTH, expand=True) + if self.current_pdf_path and os.path.exists(self.current_pdf_path): + self.pdf_frame.load_pdf(self.current_pdf_path) + + def open_pdf_file(self): + file_path = filedialog.askopenfilename(filetypes=[("PDF文件", "*.pdf")]) + if file_path: + self.current_pdf_path = file_path + self.mode_var.set("pdf") + self.switch_mode() + +if __name__ == "__main__": + root = tk.Tk() + app = NovelReaderApp(root) + root.mainloop() \ No newline at end of file