Update: 初学记、佩文韵府 and 五车韵瑞

This commit is contained in:
denglifan
2026-03-22 16:18:35 +08:00
parent df475fd03f
commit 183b842090
553 changed files with 754048 additions and 169 deletions

View File

@@ -0,0 +1,104 @@
import asyncio
import json
import re
from playwright.async_api import async_playwright
# Index page of the scanned wiki edition on ctext.org (res=87723), using the
# simplified-Chinese (gb) interface; chapter links are harvested from here.
CTEXT_INDEX_URL = "https://ctext.org/wiki.pl?if=gb&res=87723&remap=gb"
def _parse_chapter_text(text_content, results):
    """Parse one chapter's raw text into *results* (headword -> list of entries).

    The wiki text on ctext.org is uncorrected OCR, so these heuristics are
    best-effort: a line starting with "卷" is a volume header, a short line
    containing "声" is a tone header, everything else is a dictionary entry.

    NOTE(review): the marker characters were garbled to empty strings in the
    original source ("".startswith / "" in line), which made this parsing
    unreachable; "卷"/"声" are reconstructed from the surrounding comments and
    output keys — confirm against the real page text.
    """
    current_volume = "未知卷"
    current_rhyme = "未知韵"  # never updated by the current heuristics
    current_tone = "平声"
    for raw_line in text_content.split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        # Volume header, e.g. "卷一". (Original used line.startswith(""),
        # which is always True and swallowed every line.)
        if line.startswith("卷"):
            current_volume = line
            continue
        # Tone header, e.g. "平声": a short line containing "声".
        if "声" in line and len(line) < 5:
            current_tone = line
            continue
        # Split headword from content at the first colon or whitespace.
        parts = re.split(r'[:\s]', line, maxsplit=1)
        if len(parts) == 2:
            word, content = parts
        else:
            # No separator found: assume a two-character headword.
            word = line[0:2] if len(line) > 2 else line
            content = line[2:]
        results.setdefault(word, []).append({
            "卷": current_volume,
            "大韵": current_rhyme,
            "声调": current_tone,
            "词条内容": content,
        })


async def scrape_ctext():
    """Scrape sample chapters from ctext.org and dump parsed entries to JSON.

    Runs a headed (non-headless) Chromium instance because ctext.org's
    anti-bot protection tends to block headless browsers. Only the first two
    chapters are fetched as a demo. Side effect: writes "五车韵瑞_示例.json"
    in the current working directory.
    """
    results = {}
    async with async_playwright() as p:
        # Headed mode is required to get past the anti-scraping checks.
        browser = await p.chromium.launch(headless=False)
        try:
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                viewport={"width": 1280, "height": 800},
            )
            page = await context.new_page()
            print(f"正在访问目录页: {CTEXT_INDEX_URL}")
            await page.goto(CTEXT_INDEX_URL, wait_until="domcontentloaded")
            # Give the Cloudflare challenge time to clear before scraping.
            await page.wait_for_timeout(5000)
            # Collect chapter links, de-duplicated while preserving order
            # (a set gives O(1) membership instead of scanning the list).
            links = await page.locator("a[href*='wiki.pl?if=gb&chapter=']").all()
            chapter_urls = []
            seen = set()
            for link in links:
                href = await link.get_attribute("href")
                if href:
                    full_url = "https://ctext.org/" + href
                    if full_url not in seen:
                        seen.add(full_url)
                        chapter_urls.append(full_url)
            print(f"找到 {len(chapter_urls)} 个章节链接。")
            # Only the first couple of chapters, as a demo run.
            for url in chapter_urls[:2]:
                print(f"正在抓取章节: {url}")
                await page.goto(url, wait_until="domcontentloaded")
                await page.wait_for_timeout(3000)
                # Concatenate the text of every td.ctext cell on the page.
                text_content = await page.evaluate("""() => {
                    const tds = document.querySelectorAll('td.ctext');
                    let text = '';
                    tds.forEach(td => { text += td.innerText + '\\n'; });
                    return text;
                }""")
                print("--- 抓取到的原始文本前100字 ---")
                print(text_content[:100])
                print("--------------------------------")
                _parse_chapter_text(text_content, results)
        finally:
            # Close the browser even when navigation/timeouts raise, so a
            # failed run does not leak a headed Chromium process.
            await browser.close()
    with open("五车韵瑞_示例.json", "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print("提取完成。由于源数据本身存在大量OCR乱码JSON数据可能需要大量人工清洗。")
# Script entry point: run the async scraper to completion.
if __name__ == "__main__":
    asyncio.run(scrape_ctext())