Files
spider-ctext/五车韵瑞/scrape_wuche.py
2026-03-22 16:18:35 +08:00

105 lines
4.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import asyncio
import json
import re
from playwright.async_api import async_playwright
# Wiki index page of 五车韵瑞 (Wuche Yunrui) on ctext.org; the "gb" parameters
# appear to select the simplified-character interface/remap — confirm on site.
CTEXT_INDEX_URL = "https://ctext.org/wiki.pl?if=gb&res=87723&remap=gb"
async def _collect_chapter_urls(page):
    """Return deduplicated chapter URLs found on the index page, in order.

    Only anchors whose href targets ``wiki.pl?if=gb&chapter=`` are collected.
    """
    urls = []
    for link in await page.locator("a[href*='wiki.pl?if=gb&chapter=']").all():
        href = await link.get_attribute("href")
        if href:
            full_url = "https://ctext.org/" + href
            if full_url not in urls:
                urls.append(full_url)
    return urls


def _parse_chapter_text(text_content, results):
    """Heuristically parse raw chapter text into *results* (word -> records).

    The CText wiki text for this work is largely unproofread OCR output, so
    this line-oriented parse is best-effort only and will need manual cleanup.

    NOTE(review): the original code tested ``line.startswith("")`` and
    ``"" in line`` — always-true / never-useful conditions caused by marker
    characters being lost in transit.  With ``startswith("")`` every line was
    treated as a volume header and skipped, so no entries were ever produced.
    "卷" (volume) and "声" (tone) are the most plausible intended markers —
    TODO: confirm against the original script/source page.
    """
    current_volume = "未知卷"
    current_rhyme = "未知韵"
    current_tone = "平声"
    for raw_line in text_content.split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        # Volume header, e.g. "卷一" — see NOTE above about the restored char.
        if line.startswith("卷"):
            current_volume = line
            continue
        # Short tone marker line, e.g. "平声" / "上声".
        if "声" in line and len(line) < 5:
            current_tone = line
            continue
        # Split "headword <sep> content"; the full-width colon "：" is added
        # alongside the ASCII one since the source text is Chinese — the
        # original separator char may also have been lost; TODO confirm.
        parts = re.split(r'[:：\s]', line, maxsplit=1)
        if len(parts) == 2:
            word, content = parts
        else:
            # No separator: assume a two-character headword at line start.
            word = line[0:2] if len(line) > 2 else line
            content = line[2:]
        results.setdefault(word, []).append({
            # This key was the empty string "" in the original (lost char);
            # "卷" matches the other keys' style — TODO confirm intended key.
            "卷": current_volume,
            "大韵": current_rhyme,
            "声调": current_tone,
            "词条内容": content,
        })


async def scrape_ctext():
    """Scrape sample chapters of 五车韵瑞 from ctext.org into a JSON file.

    Launches a headed Chromium via Playwright (headless mode is blocked by
    the site's anti-bot protection), collects chapter links from the wiki
    index, scrapes the first two chapters as a sample, parses each into
    per-headword records, and writes ``五车韵瑞_示例.json``.
    """
    results = {}
    # Headed (non-headless) mode is required to get past the anti-scraping checks.
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        try:
            context = await browser.new_context(
                user_agent=(
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/120.0.0.0 Safari/537.36"
                ),
                viewport={"width": 1280, "height": 800},
            )
            page = await context.new_page()
            print(f"正在访问目录页: {CTEXT_INDEX_URL}")
            await page.goto(CTEXT_INDEX_URL, wait_until="domcontentloaded")
            # Pause to let the Cloudflare challenge resolve.
            await page.wait_for_timeout(5000)
            chapter_urls = await _collect_chapter_urls(page)
            print(f"找到 {len(chapter_urls)} 个章节链接。")
            # Only the first two chapters, as a sample run.
            for url in chapter_urls[:2]:
                print(f"正在抓取章节: {url}")
                await page.goto(url, wait_until="domcontentloaded")
                await page.wait_for_timeout(3000)
                # Concatenate the text of all ctext table cells on the page.
                text_content = await page.evaluate("""() => {
                    const tds = document.querySelectorAll('td.ctext');
                    let text = '';
                    tds.forEach(td => { text += td.innerText + '\\n'; });
                    return text;
                }""")
                print("--- 抓取到的原始文本前100字 ---")
                print(text_content[:100])
                print("--------------------------------")
                _parse_chapter_text(text_content, results)
        finally:
            # Close the browser even if navigation/parsing raises.
            await browser.close()
    with open("五车韵瑞_示例.json", "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print("提取完成。由于源数据本身存在大量OCR乱码JSON数据可能需要大量人工清洗。")
# Script entry point: drive the async scraper to completion.
if __name__ == "__main__":
    asyncio.run(scrape_ctext())