"""Scrape sample chapters of 五车韵瑞 from ctext.org with Playwright and save parsed entries to JSON."""
import asyncio
import json
import re
from urllib.parse import urljoin

from playwright.async_api import async_playwright
# Index page listing every chapter link of the text (gb = simplified-Chinese interface).
CTEXT_INDEX_URL = "https://ctext.org/wiki.pl?if=gb&res=87723&remap=gb"
async def scrape_ctext():
|
||
results = {}
|
||
|
||
# 启动Playwright(必须使用非无头模式以绕过反爬)
|
||
async with async_playwright() as p:
|
||
browser = await p.chromium.launch(headless=False)
|
||
context = await browser.new_context(
|
||
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||
viewport={"width": 1280, "height": 800}
|
||
)
|
||
page = await context.new_page()
|
||
|
||
print(f"正在访问目录页: {CTEXT_INDEX_URL}")
|
||
await page.goto(CTEXT_INDEX_URL, wait_until="domcontentloaded")
|
||
|
||
# 停顿等待Cloudflare盾
|
||
await page.wait_for_timeout(5000)
|
||
|
||
# 提取所有章节链接
|
||
links = await page.locator("a[href*='wiki.pl?if=gb&chapter=']").all()
|
||
chapter_urls = []
|
||
for link in links:
|
||
href = await link.get_attribute("href")
|
||
text = await link.inner_text()
|
||
if href:
|
||
full_url = "https://ctext.org/" + href
|
||
if full_url not in chapter_urls:
|
||
chapter_urls.append(full_url)
|
||
|
||
print(f"找到 {len(chapter_urls)} 个章节链接。")
|
||
|
||
# 遍历前几个章节作为示例
|
||
for url in chapter_urls[:2]:
|
||
print(f"正在抓取章节: {url}")
|
||
await page.goto(url, wait_until="domcontentloaded")
|
||
await page.wait_for_timeout(3000)
|
||
|
||
# 提取文本区
|
||
text_content = await page.evaluate("""() => {
|
||
const tds = document.querySelectorAll('td.ctext');
|
||
let text = '';
|
||
tds.forEach(td => { text += td.innerText + '\\n'; });
|
||
return text;
|
||
}""")
|
||
|
||
print("--- 抓取到的原始文本前100字 ---")
|
||
print(text_content[:100])
|
||
print("--------------------------------")
|
||
|
||
# 解析逻辑(基于假设的文本结构)
|
||
current_volume = "未知卷"
|
||
current_rhyme = "未知韵"
|
||
current_tone = "平声"
|
||
|
||
lines = text_content.split('\n')
|
||
for line in lines:
|
||
line = line.strip()
|
||
if not line: continue
|
||
|
||
# 尝试识别卷和韵(注:CText上的维基文本实际上是未经校对的乱码OCR,因此这里的正则很难完美匹配)
|
||
if line.startswith("卷"):
|
||
current_volume = line
|
||
continue
|
||
if "声" in line and len(line) < 5:
|
||
current_tone = line
|
||
continue
|
||
|
||
# 尝试分离词条和内容(假设词条在行首且长度<4)
|
||
parts = re.split(r'[::\s]', line, maxsplit=1)
|
||
if len(parts) == 2:
|
||
word = parts[0]
|
||
content = parts[1]
|
||
else:
|
||
word = line[0:2] if len(line) > 2 else line
|
||
content = line[2:]
|
||
|
||
# 按用户要求格式构建字典
|
||
if word not in results:
|
||
results[word] = []
|
||
|
||
results[word].append({
|
||
"卷": current_volume,
|
||
"大韵": current_rhyme,
|
||
"声调": current_tone,
|
||
"词条内容": content
|
||
})
|
||
|
||
await browser.close()
|
||
|
||
# 保存为JSON
|
||
with open("五车韵瑞_示例.json", "w", encoding="utf-8") as f:
|
||
json.dump(results, f, ensure_ascii=False, indent=2)
|
||
|
||
print("提取完成。由于源数据本身存在大量OCR乱码,JSON数据可能需要大量人工清洗。")
|
||
|
||
if __name__ == "__main__":
    # Script entry point: run the async scraper to completion.
    asyncio.run(scrape_ctext())