import asyncio
import json
import re

# Index page of the dictionary on ctext.org (simplified-Chinese wiki view).
CTEXT_INDEX_URL = "https://ctext.org/wiki.pl?if=gb&res=87723&remap=gb"

# First headword/definition separator: ASCII colon, fullwidth colon, or whitespace.
_SEPARATOR_RE = re.compile(r'[::\s]')


def parse_entries(text_content, results):
    """Parse raw chapter text into dictionary entries, accumulating into *results*.

    The source text on ctext.org's wiki is largely unproofread OCR, so the
    heuristics are best-effort: a line starting with "卷" marks a new volume,
    a short line containing "声" marks a tone section, and every other line is
    treated as a headword + definition pair.

    Args:
        text_content: Raw text extracted from one chapter page.
        results: Dict mapping headword -> list of entry dicts. Mutated in
            place so entries accumulate across chapters.

    Returns:
        The same *results* dict, for call-chaining convenience.
    """
    current_volume = "未知卷"
    # NOTE(review): no rule ever detects the rhyme group yet; the placeholder
    # is kept so the output schema stays stable for downstream cleaning.
    current_rhyme = "未知韵"
    current_tone = "平声"
    for raw_line in text_content.split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        # Volume heading, e.g. "卷一".
        if line.startswith("卷"):
            current_volume = line
            continue
        # Tone heading, e.g. "上声" — a short line containing "声".
        if "声" in line and len(line) < 5:
            current_tone = line
            continue
        # Split headword from definition at the first colon/whitespace.
        parts = _SEPARATOR_RE.split(line, maxsplit=1)
        if len(parts) == 2:
            word, content = parts
        else:
            # No separator found: assume a two-character headword when the
            # line is long enough, otherwise the whole line is the headword.
            word = line[0:2] if len(line) > 2 else line
            content = line[2:]
        results.setdefault(word, []).append({
            "卷": current_volume,
            "大韵": current_rhyme,
            "声调": current_tone,
            "词条内容": content,
        })
    return results


async def scrape_ctext():
    """Scrape sample chapters of the dictionary from ctext.org and dump JSON.

    Visits the index page, collects chapter links, fetches the first two
    chapters, parses them with :func:`parse_entries`, and writes the result
    to ``五车韵瑞_示例.json``.
    """
    # Lazy import so parse_entries stays importable/testable without
    # playwright installed.
    from playwright.async_api import async_playwright

    results = {}
    async with async_playwright() as p:
        # Headed mode is required to get past the site's anti-bot check.
        browser = await p.chromium.launch(headless=False)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            viewport={"width": 1280, "height": 800}
        )
        page = await context.new_page()

        print(f"正在访问目录页: {CTEXT_INDEX_URL}")
        await page.goto(CTEXT_INDEX_URL, wait_until="domcontentloaded")
        # Pause to let the Cloudflare challenge resolve.
        await page.wait_for_timeout(5000)

        # Collect chapter links, de-duplicated in order of appearance.
        links = await page.locator("a[href*='wiki.pl?if=gb&chapter=']").all()
        chapter_urls = []
        for link in links:
            href = await link.get_attribute("href")
            if href:
                full_url = "https://ctext.org/" + href
                if full_url not in chapter_urls:
                    chapter_urls.append(full_url)
        print(f"找到 {len(chapter_urls)} 个章节链接。")

        # Only the first two chapters, as a sample run.
        for url in chapter_urls[:2]:
            print(f"正在抓取章节: {url}")
            await page.goto(url, wait_until="domcontentloaded")
            await page.wait_for_timeout(3000)

            # Concatenate the text of every td.ctext cell on the page.
            text_content = await page.evaluate("""() => {
                const tds = document.querySelectorAll('td.ctext');
                let text = '';
                tds.forEach(td => { text += td.innerText + '\\n'; });
                return text;
            }""")

            print("--- 抓取到的原始文本前100字 ---")
            print(text_content[:100])
            print("--------------------------------")

            parse_entries(text_content, results)

        await browser.close()

    with open("五车韵瑞_示例.json", "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print("提取完成。由于源数据本身存在大量OCR乱码,JSON数据可能需要大量人工清洗。")


if __name__ == "__main__":
    asyncio.run(scrape_ctext())