Update: 删除垃圾程序

This commit is contained in:
denglifan
2026-03-22 16:43:10 +08:00
parent 183b842090
commit 2881163106
41 changed files with 93 additions and 2046 deletions

View File

@@ -1,8 +0,0 @@
import json

# Quick inspection helper: show the first 20 dictionary entries (词条)
# stored under the rhyme character "𩅰" in the parsed Peiwenyunfu JSON.
with open('peiwenyunfu.json', 'r', encoding='utf-8') as fh:
    payload = json.load(fh)

entries = payload.get("𩅰", {}).get("词条", {})
for key, body in list(entries.items())[:20]:
    print(f"{key}: {body[:30]}...")

View File

@@ -1,7 +0,0 @@
import json

# Debug helper: pretty-print a handful of rhyme sections from the v2 JSON.
# NOTE(review): three keys in the list below were lost to encoding
# corruption (empty strings) — the original characters are unrecoverable
# from this source and must be restored before this script is useful.
with open('peiwenyunfu_v2.json', 'r', encoding='utf-8') as fh:
    payload = json.load(fh)

for rhyme_key in ['𩅰', '', '', '']:
    print(f"Rhyme: {rhyme_key}")
    print(json.dumps(payload.get(rhyme_key, {}), ensure_ascii=False, indent=2))

View File

@@ -1,71 +0,0 @@
import os
import time
import urllib.request
import urllib.parse
from urllib.error import HTTPError
def main():
    """Fetch every URL listed in still_missing.txt into html_files/.

    All requests go through the local HTTP proxy on 127.0.0.1:10808; each
    page is attempted up to five times, backing off on 403 responses.
    """
    if not os.path.exists("html_files"):
        os.makedirs("html_files")

    with open("still_missing.txt", "r", encoding="utf-8") as fh:
        url_list = [ln.strip() for ln in fh if ln.strip()]

    # Route all urllib traffic through the local proxy.
    handler = urllib.request.ProxyHandler(
        {'http': 'http://127.0.0.1:10808', 'https': 'http://127.0.0.1:10808'})
    urllib.request.install_opener(urllib.request.build_opener(handler))

    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Referer': 'https://zh.wikisource.org/',
    }

    done = 0
    total = len(url_list)
    for page_url in url_list:
        vol = urllib.parse.unquote(page_url).split('/')[-1]
        target = f'html_files/{vol}.html'
        if os.path.exists(target):
            print(f"[{done+1}/{total}] Skipping {vol} (exists)")
            done += 1
            continue
        print(f"[{done+1}/{total}] Downloading {vol}...")
        fetched = False
        for _ in range(5):
            try:
                req = urllib.request.Request(page_url, headers=request_headers)
                response = urllib.request.urlopen(req, timeout=15)
                body = response.read().decode('utf-8')
                with open(target, 'w', encoding='utf-8') as out_f:
                    out_f.write(body)
                print(f" -> Success!")
                fetched = True
                break
            except HTTPError as err:
                if err.code == 403:
                    print(f" -> 403 Forbidden. Waiting 5 seconds...")
                    time.sleep(5)
                else:
                    print(f" -> HTTP Error {err.code}")
            except Exception as err:
                print(f" -> Error: {err}")
                time.sleep(2)
        if not fetched:
            print(f" -> Failed all attempts.")
        done += 1
        time.sleep(1)  # Be nice to server


if __name__ == "__main__":
    main()

View File

@@ -1,110 +0,0 @@
import urllib.request
from bs4 import BeautifulSoup
import urllib.parse
import re
import os
url = "https://api.allorigins.win/raw?url=https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)"
def chinese_to_arabic(cn_str):
    """Convert a Chinese numeral string (e.g. "一百零六") to an int.

    Supports digits 〇/零/一..九 and units 十/百/千.

    Bug fix: the character keys of `cn_num` and the unit comparisons below
    were lost to encoding corruption in the original file (they all read as
    empty strings, which made the function return 0 for every input). They
    have been restored to the standard numerals the algorithm clearly
    expects — confirm against the original repository if possible.
    """
    cn_num = {
        "〇": 0, "零": 0, "一": 1, "二": 2, "三": 3, "四": 4,
        "五": 5, "六": 6, "七": 7, "八": 8, "九": 9, "十": 10,
        "百": 100, "千": 1000,
    }
    result = 0
    temp = 0
    for char in cn_str:
        if char in ["百", "千"]:
            # A bare unit (e.g. "百") means one of that unit.
            if temp == 0:
                temp = 1
            result += temp * cn_num[char]
            temp = 0
        elif char == "十":
            if temp == 0:
                temp = 1
            if len(cn_str) == 1:
                return 10  # "十" alone is exactly ten
            elif result == 0 and temp == 1 and cn_str[0] == "十":
                result += 10  # leading "十X" form, e.g. "十六"
                temp = 0
            else:
                result += temp * cn_num[char]
                temp = 0
        else:
            temp = cn_num.get(char, 0)
    result += temp
    return result
def get_filename(vol_str):
    """Derive the on-disk HTML file name from a volume title such as 卷一百之三."""
    part_match = re.match(r"卷(.+?)之(.+)", vol_str)
    if part_match:
        major = chinese_to_arabic(part_match.group(1))
        minor = chinese_to_arabic(part_match.group(2))
        return f"{major:03d}{minor}.html"
    whole_match = re.match(r"卷(.+)", vol_str)
    if whole_match:
        return f"{chinese_to_arabic(whole_match.group(1)):03d}.html"
    # Not a 卷 title at all: fall back to the raw string.
    return vol_str + ".html"
def main():
    """Collect still-missing volume URLs from the Wikisource index page.

    Fetches the index (module-level `url`), extracts every 卷X之Y link,
    compares the derived file names against the HTML already on disk, and
    writes the absent URLs to missing_urls.txt.
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=30) as response:
            html = response.read().decode("utf-8")
    except Exception as e:
        print("Failed to fetch:", e)
        # To avoid failing the test if the network is down but it mocks something else
        # we can just pass here, but normally a mocked urllib would work.
        pass
    try:
        # If the fetch above failed, `html` is unbound and this raises
        # NameError, which is caught below to abort quietly.
        soup = BeautifulSoup(html, "html.parser")
    except NameError:
        return
    html_dir = "/mnt/fast/private/denglifan/workspace/spider-ctext/佩文韵府/html_files/"
    existing_files = set(os.listdir(html_dir)) if os.path.exists(html_dir) else set()
    missing_urls = []
    seen_urls = set()
    base_url = "https://zh.wikisource.org"
    for a in soup.find_all("a"):
        href = a.get("href")
        if not href:
            continue
        unquoted_href = urllib.parse.unquote(href)
        # Only include `卷XXX之Y` links (ignore 全览 files)
        if "御定佩文韻府" in unquoted_href and "四庫全書本" in unquoted_href and "/卷" in unquoted_href:
            title_part = unquoted_href.split("/")[-1]
            if "全覽" in title_part or "全览" in title_part:
                continue
            # Filter for `卷XXX之Y` pattern if strictly needed.
            # But let's check regex pattern "卷.+?之.+"
            if not re.match(r"卷.+?之.+", title_part):
                continue
            full_url = urllib.parse.urljoin(base_url, href)
            # Drop fragment anchors so duplicates collapse to one URL.
            full_url = full_url.split('#')[0]
            if full_url in seen_urls:
                continue
            seen_urls.add(full_url)
            filename = get_filename(title_part)
            if filename not in existing_files:
                missing_urls.append(full_url)
    with open("missing_urls.txt", "w", encoding="utf-8") as f:
        for u in missing_urls:
            f.write(u + "\n")
    print(f"Found {len(missing_urls)} missing URLs.")


if __name__ == "__main__":
    main()

View File

@@ -1,14 +0,0 @@
import json

# Replace the placeholder character in every entry body with its rhyme key.
# NOTE(review): the placeholder literal was lost to encoding corruption and
# now reads as an empty string; `content.replace("", rhyme)` inserts the
# rhyme between every character, so the original placeholder must be
# restored before running this script.
with open('peiwenyunfu.json', 'r', encoding='utf-8') as fh:
    book = json.load(fh)

for rhyme, section in book.items():
    if rhyme in ["metadata", "preface"]:
        continue
    if "词条" in section:
        for headword, body in section["词条"].items():
            if "" in body:
                section["词条"][headword] = body.replace("", rhyme)

with open('peiwenyunfu.json', 'w', encoding='utf-8') as fh:
    json.dump(book, fh, ensure_ascii=False, indent=4)

View File

@@ -1,69 +0,0 @@
import json
import re
# Load the merged dictionary produced by the combine step.
print("Loading peiwenyunfu.json...")
with open('peiwenyunfu.json', 'r', encoding='utf-8') as fh:
    data = json.load(fh)
# NOTE(review): two prefix entries were lost to encoding corruption (empty
# strings); stripping an empty prefix is a no-op, so only 韻藻 takes effect.
prefixes = ["韻藻", "", ""]


def clean_headword(word):
    """Strip known section prefixes (e.g. 韻藻) off a headword, at most twice."""
    stripped = word
    for _ in range(2):  # handles doubled prefixes such as 増韻藻
        for prefix in prefixes:
            if stripped.startswith(prefix) and len(stripped) > len(prefix):
                stripped = stripped[len(prefix):]
    return stripped


def replace_pipes_in_content(content, word):
    """Expand runs of the ditto mark 丨 into repetitions of the headword.

    A run whose length is a multiple of the headword length is treated as
    whole-word repetition (and resets alignment); otherwise characters are
    drawn from the headword cyclically, continuing where the previous
    partial run stopped.
    """
    headword = clean_headword(word)
    size = len(headword)
    if size == 0 or "" not in content:
        return content

    state = {"idx": 0}

    def expand(match):
        run = match.group(0)
        length = len(run)
        if length % size == 0:
            # Whole-word repetition: reset the cyclic position.
            state["idx"] = 0
            return headword * (length // size)
        pieces = []
        for _ in range(length):
            pieces.append(headword[state["idx"] % size])
            state["idx"] += 1
        return "".join(pieces)

    return re.sub(r'丨+', expand, content)
print("Processing...")
for rhyme, section in data.items():
    if rhyme in ["metadata", "preface"]:
        continue
    # 1. Patch the small-rhyme description with the rhyme character itself.
    if "小韵描述" in section and section["小韵描述"]:
        # NOTE(review): the placeholder literal was lost to encoding
        # corruption (empty string); replace("", rhyme) inserts the rhyme
        # between every character — restore the real placeholder.
        section["小韵描述"] = section["小韵描述"].replace("", rhyme)
    # 2. Expand ditto marks in every entry body.
    if "词条" in section:
        section["词条"] = {
            headword: replace_pipes_in_content(body, headword)
            for headword, body in section["词条"].items()
        }
print("Saving peiwenyunfu.json...")
with open('peiwenyunfu.json', 'w', encoding='utf-8') as fh:
    json.dump(data, fh, ensure_ascii=False, indent=4)
print("Done!")

View File

@@ -1,24 +0,0 @@
import json

print("Loading...")
with open('peiwenyunfu.json', 'r', encoding='utf-8') as fh:
    book = json.load(fh)

# Substitute the placeholder with the rhyme character in the 对语/摘句
# sections and in entry headwords.
# NOTE(review): the placeholder literal was lost to encoding corruption
# (empty string); replace("", x) inserts x between every character, so the
# original literal must be restored before use.
for rhyme, section in book.items():
    if rhyme in ["metadata", "preface"]:
        continue
    if "对语" in section and section["对语"]:
        section["对语"] = section["对语"].replace("", rhyme)
    if "摘句" in section and section["摘句"]:
        section["摘句"] = section["摘句"].replace("", rhyme)
    if "词条" in section:
        section["词条"] = {
            headword.replace("", rhyme): body
            for headword, body in section["词条"].items()
        }

with open('peiwenyunfu.json', 'w', encoding='utf-8') as fh:
    json.dump(book, fh, ensure_ascii=False, indent=4)
print("Done!")

View File

@@ -1,57 +0,0 @@
import json
print("Loading...")
with open('peiwenyunfu.json', 'r', encoding='utf-8') as fh:
    data = json.load(fh)

# Carry the bookkeeping sections over verbatim; everything else is rebuilt.
new_data = {}
new_data['metadata'] = data['metadata']
new_data['preface'] = data['preface']
# NOTE(review): two prefix entries were lost to encoding corruption and now
# read as empty strings.
prefixes = ["韻藻", "韵藻", "", ""]


def clean_key(k):
    """Repeatedly strip known prefixes off headword *k* until none match.

    Bug fix: `k.startswith("")` is always true for the corrupted empty
    prefixes, which left `changed` permanently set and made this loop spin
    forever on any non-empty key. Empty prefixes are now skipped so the
    loop terminates.
    """
    changed = True
    while changed:
        changed = False
        for p in prefixes:
            if not p:  # corrupted (empty) prefix would never terminate
                continue
            if k.startswith(p) and len(k) > len(p):
                k = k[len(p):]
                changed = True
    return k
for rhyme, r_data in data.items():
    if rhyme in ['metadata', 'preface']:
        continue
    # Seed the rebuilt section for the main rhyme character.
    # NOTE(review): several field names below were lost to encoding
    # corruption (empty-string keys) — restore them before trusting output.
    new_data[rhyme] = {
        "": r_data[""],
        "": r_data[""],
        "": r_data[""],
        "小韵描述": r_data["小韵描述"],
        "韵藻": {},
        "对语": r_data.get("对语", ""),
        "摘句": r_data.get("摘句", "")
    }
    current_rhyme = rhyme
    for k, v in r_data.get("词条", {}).items():
        # A one-character key whose body opens with a dictionary-source name
        # (説文/廣韻/...) starts a new small-rhyme section.
        # NOTE(review): the marker list contains corrupted empty strings, so
        # this condition is currently always true for 1-char keys.
        if len(k) == 1 and any(x in v[:15] for x in ['', '説文', '廣韻', '玉篇', '集韻', '韻㑹', '', '', '釋名', '爾雅']):
            current_rhyme = k
            new_data[current_rhyme] = {
                "": r_data[""],
                "": r_data[""],
                "": r_data[""],
                "小韵描述": k + v,
                "韵藻": {},
                "对语": "",
                "摘句": ""
            }
        else:
            new_data[current_rhyme]["韵藻"][clean_key(k)] = v

with open('peiwenyunfu_v2.json', 'w', encoding='utf-8') as fh:
    json.dump(new_data, fh, ensure_ascii=False, indent=4)
print(f"Old size: {len(data)}, New size: {len(new_data)}")

View File

@@ -1,28 +0,0 @@
import json
print("Loading peiwenyunfu.json...")
with open('peiwenyunfu.json', 'r', encoding='utf-8') as fh:
    data = json.load(fh)

# Forward-fill the volume field: sections parsed without a volume inherit
# the most recent non-empty one (dict order preserves parse order).
# NOTE(review): the volume field name was lost to encoding corruption and
# now reads as the empty string "".
last_volume = ""
fixed_count = 0
for rhyme, section in data.items():
    if rhyme in ['metadata', 'preface']:
        continue
    volume = section.get("", "")
    if volume.strip():
        last_volume = volume.strip()
    elif last_volume:
        section[""] = last_volume
        fixed_count += 1

print(f"Fixed {fixed_count} missing volumes.")
with open('peiwenyunfu.json', 'w', encoding='utf-8') as fh:
    json.dump(data, fh, ensure_ascii=False, indent=4)
print("Saved to peiwenyunfu.json!")

View File

@@ -1,83 +0,0 @@
import os
import glob
import json
import re
from parser import parse_html
def natural_sort_key(s):
    """Sort key for paths like html_files/卷001之1.html → (1, volume, part).

    File names not matching the 卷N[之M] pattern sort after everything else
    via the leading tuple element.
    """
    name = os.path.basename(s)
    hit = re.search(r"卷(\d+)(?:之(\d+))?", name)
    if not hit:
        return (2, 0, 0)  # fallback bucket
    volume = int(hit.group(1))
    part = int(hit.group(2)) if hit.group(2) else 0
    return (1, volume, part)
def main():
    """Parse every per-volume HTML file and merge the results into one JSON
    under a "content" key (wrapped with book metadata)."""
    # Only pick files that start with "卷" to avoid "全覽" duplicate aggregations
    files = glob.glob('html_files/卷*.html')
    files.sort(key=natural_sort_key)
    print(f"Starting to parse {len(files)} files...")
    combined_result = {}
    success_count = 0
    fail_count = 0
    for idx, fpath in enumerate(files):
        try:
            res = parse_html(fpath)
            for k, v in res.items():
                # Remove "(韵母)" prefix
                clean_key = k.replace("(韵母)", "")
                if clean_key not in combined_result:
                    combined_result[clean_key] = v
                else:
                    # Merge entries
                    combined_result[clean_key]["词条"].update(v.get("词条", {}))
                    if "对语" in v and v["对语"]:
                        combined_result[clean_key]["对语"] += v["对语"]
                    if "摘句" in v and v["摘句"]:
                        combined_result[clean_key]["摘句"] += v["摘句"]
                    # Also, if the initial file didn't have "卷" properly parsed, update it
                    # NOTE(review): the three field names below were lost to
                    # encoding corruption (empty strings) — restore them.
                    if not combined_result[clean_key][""] and v[""]:
                        combined_result[clean_key][""] = v[""]
                    if not combined_result[clean_key][""] and v[""]:
                        combined_result[clean_key][""] = v[""]
                    if not combined_result[clean_key][""] and v[""]:
                        combined_result[clean_key][""] = v[""]
            success_count += 1
            if idx % 50 == 0:
                print(f"Parsed {idx}/{len(files)} files...")
        except Exception as e:
            print(f"Failed to parse {fpath}: {e}")
            fail_count += 1
    print(f"Parsing finished. Success: {success_count}, Failed: {fail_count}")
    print(f"Total unique rhyme characters extracted: {len(combined_result)}")
    # Construct final output with metadata
    final_output = {
        "metadata": {
            "title": "御定佩文韵府",
            "author": "张玉书等",
            "dynasty": "",
            "total_volumes": 106,
            "source": "2026年3月22日从维基文库导出"
        },
        "preface": "",
        "content": combined_result
    }
    with open('peiwenyunfu.json', 'w', encoding='utf-8') as f:
        json.dump(final_output, f, ensure_ascii=False, indent=4)
    print("Saved output to peiwenyunfu.json")


if __name__ == "__main__":
    main()

View File

@@ -1,79 +0,0 @@
import os
import glob
import json
import re
from parser import parse_html
def natural_sort_key(s):
    """Order volume file paths numerically by (卷 number, 之 part)."""
    base = os.path.basename(s)
    found = re.search(r"卷(\d+)(?:之(\d+))?", base)
    if found is None:
        return (2, 0, 0)  # non-matching names sort last
    return (1, int(found.group(1)), int(found.group(2) or 0))
def main():
    """Parse all volume HTML files and merge them into peiwenyunfu.json.

    Unlike the earlier combine script, rhyme sections are stored at the top
    level of the output (next to "metadata"/"preface") rather than under a
    "content" key.
    """
    files = glob.glob('html_files/卷*.html')
    files.sort(key=natural_sort_key)
    print(f"Starting to parse {len(files)} files...")
    combined_result = {}
    success_count = 0
    fail_count = 0
    for idx, fpath in enumerate(files):
        try:
            res = parse_html(fpath)
            for k, v in res.items():
                # Remove "(韵母)" prefix
                clean_key = k.replace("(韵母)", "")
                if clean_key not in combined_result:
                    combined_result[clean_key] = v
                else:
                    # Merge entries
                    combined_result[clean_key]["词条"].update(v.get("词条", {}))
                    if "对语" in v and v["对语"]:
                        combined_result[clean_key]["对语"] += v["对语"]
                    if "摘句" in v and v["摘句"]:
                        combined_result[clean_key]["摘句"] += v["摘句"]
                    # Backfill fields the first-seen file failed to parse.
                    # NOTE(review): the three field names below were lost to
                    # encoding corruption (empty strings) — restore them.
                    if not combined_result[clean_key][""] and v[""]:
                        combined_result[clean_key][""] = v[""]
                    if not combined_result[clean_key][""] and v[""]:
                        combined_result[clean_key][""] = v[""]
                    if not combined_result[clean_key][""] and v[""]:
                        combined_result[clean_key][""] = v[""]
            success_count += 1
            if idx % 50 == 0:
                print(f"Parsed {idx}/{len(files)} files...")
        except Exception as e:
            print(f"Failed to parse {fpath}: {e}")
            fail_count += 1
    print(f"Parsing finished. Success: {success_count}, Failed: {fail_count}")
    print(f"Total unique rhyme characters extracted: {len(combined_result)}")
    final_output = {
        "metadata": {
            "title": "御定佩文韵府",
            "author": "张玉书等奉敕编",
            "dynasty": "",
            "total_volumes": 106,
            "source": "2026年3月22日从维基文库导出"
        },
        "preface": ""
    }
    final_output.update(combined_result)
    with open('peiwenyunfu.json', 'w', encoding='utf-8') as f:
        json.dump(final_output, f, ensure_ascii=False, indent=4)
    print("Saved output to peiwenyunfu.json")


if __name__ == "__main__":
    main()

View File

@@ -1,9 +0,0 @@
def simplify_char(c):
    """Map a traditional character to its simplified form (identity fallback).

    NOTE(review): the mapping pairs were lost to encoding corruption — only
    empty-string entries remain, so this currently returns its input
    unchanged. Restore the original character pairs.
    """
    mapping = {'': '', '': '', '': ''}  # add others if needed
    return mapping.get(c, c)


# Chinese digit -> ASCII digit lookup.
# NOTE(review): the digit characters were lost to encoding corruption.
num_map = {'': '1', '': '2', '': '3', '': '4', '': '5', '': '6', '': '7', '': '8', '': '9'}
# "01之一"
# If vol_m = re.search(r"卷(.)之(.)", text)
# "0" + num_map[vol_m.group(1)] + "之" + vol_m.group(2)

View File

@@ -1,124 +0,0 @@
import re
from bs4 import BeautifulSoup
def simplify(text):
    """Replace traditional variants in *text* with simplified forms.

    NOTE(review): every mapping pair was lost to encoding corruption (all
    keys/values read as empty strings), so this is currently a no-op —
    restore the original traditional→simplified pairs.
    """
    mapping = {
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
    }
    simplified = text
    for trad, simp in mapping.items():
        simplified = simplified.replace(trad, simp)
    return simplified
def parse_html(file_path):
    """Parse one Wikisource volume page into {"(韵母)X": section-dict} form.

    Each section dict holds the volume/tone/rhyme fields, a 小韵描述
    description, and the 词条/对语/摘句 content blocks.

    NOTE(review): several string literals in this function were lost to
    encoding corruption (empty strings, same-character replaces, empty dict
    keys); restore them before trusting the output. Indentation was also
    reconstructed from a whitespace-mangled source — verify against the
    original file.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")
    poem_div = soup.find("div", class_="poem")
    if not poem_div:
        return {}
    text = poem_div.get_text()
    # Extract Tone and Rhyme
    tone_rhyme_m = re.search(r"(上平聲|下平聲|上聲|去聲|入聲)\s+(.*?韻.*)", text)
    current_tone = simplify(tone_rhyme_m.group(1)) if tone_rhyme_m else ""
    if tone_rhyme_m:
        raw_rhyme = tone_rhyme_m.group(2).strip()
        m_rhyme = re.match(r"(.*?)[韻]", raw_rhyme)
        current_rhyme = simplify(m_rhyme.group(1)) if m_rhyme else simplify(raw_rhyme.replace('', ''))
    else:
        current_rhyme = ""
    # Extract Volume
    vol_m = re.search(r"卷([一二三四五六七八九十]+)之(.+)", text)
    if vol_m:
        # NOTE(review): the numeral keys below were lost to corruption.
        num_map = {'':'1', '':'2', '':'3', '':'4', '':'5', '':'6', '':'7', '':'8', '':'9', '':'10'}
        v1 = vol_m.group(1)
        v1_digit = "".join(num_map.get(c, c) for c in v1)
        if len(v1_digit) == 1:
            v1_str = f"0{v1_digit}"
        else:
            v1_str = v1_digit
        # Fix: the volume match might capture extra text after '之', e.g. '之一\n'
        v2 = vol_m.group(2).split('\n')[0].strip()
        current_vol = f"{v1_str}{v2}"
    else:
        current_vol = ""
    # Extract chars: the line right after the tone/rhyme header lists the
    # single rhyme characters covered by this section.
    lines = text.split('\n')
    rhyme_chars = []
    for i, line in enumerate(lines):
        if tone_rhyme_m and tone_rhyme_m.group(2) in line:
            chars_line = lines[i+1]
            rhyme_chars = [c for c in chars_line.replace(' ', ' ').split() if len(c) == 1]
            break
    # Drop boilerplate lines (book title, library header, tone headers and
    # bare rhyme-character listings) before tokenizing.
    clean_lines = []
    for line in lines:
        stripped = line.strip()
        if not stripped: continue
        if stripped == "欽定四庫全書": continue
        if stripped.startswith("御定佩文韻府"): continue
        if "平聲" in stripped or "上聲" in stripped or "去聲" in stripped or "入聲" in stripped:
            if "" in stripped: continue
        if all(c in rhyme_chars or c in '  ' for c in stripped):
            continue
        clean_lines.append(stripped)
    clean_text = "".join(clean_lines)
    # Tokenize as (headword, one-or-more 〈…〉 annotation blocks) pairs.
    tokens = re.findall(r"([^〈〉]*)((?:〈[^〉]+〉)+)", clean_text)
    result = {}
    current_char = None
    for word, desc_blocks in tokens:
        word = word.strip()
        desc_content = desc_blocks.replace('', '').replace('', '')
        # Is this a main character definition?
        if word in rhyme_chars:
            current_char = word
            simplified_char = simplify(current_char)
            # Create a new entry in result
            key = f"(韵母){simplified_char}"
            if key not in result:
                result[key] = {
                    "": current_vol,
                    "": current_tone,
                    "": current_rhyme,
                    "小韵描述": simplify(current_char + desc_content),
                    "词条": {},
                    "对语": "",
                    "摘句": ""
                }
        elif word == "對語" or word == "对语":
            if current_char:
                key = f"(韵母){simplify(current_char)}"
                # Could be multiple parts, though usually one per character block
                result[key]["对语"] += desc_content
        elif word == "摘句":
            if current_char:
                key = f"(韵母){simplify(current_char)}"
                result[key]["摘句"] += desc_content
        else:
            # It's a 词条
            if current_char and word:
                key = f"(韵母){simplify(current_char)}"
                result[key]["词条"][word] = desc_content
    return result

View File

@@ -1,65 +0,0 @@
import os
import time
import urllib.parse
import requests
from requests.exceptions import RequestException
def main():
    """Download every URL in missing_urls.txt into html_files/ through the
    local proxy, retrying each page up to three times."""
    if not os.path.exists("html_files"):
        os.makedirs("html_files")
    with open("missing_urls.txt", "r", encoding="utf-8") as fh:
        targets = [ln.strip() for ln in fh if ln.strip()]
    proxies = {"http": "http://127.0.0.1:10808", "https": "http://127.0.0.1:10808"}
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    for url in targets:
        # Decode percent-encoding to recover the volume name, e.g. 卷001之1.
        volume_name = urllib.parse.unquote(url).split("/")[-1]
        filename = f"html_files/{volume_name}.html"
        if os.path.exists(filename):
            print(f"Skipping (unknown), already exists.")
            continue
        print(f"Downloading {volume_name} from {url}...")
        success = False
        retries = 3
        while not success and retries > 0:
            try:
                response = requests.get(
                    url, headers=headers, proxies=proxies, timeout=15
                )
                if response.status_code == 200:
                    with open(filename, "w", encoding="utf-8") as out_f:
                        out_f.write(response.text)
                    success = True
                    print(f"Successfully downloaded (unknown)")
                else:
                    print(
                        f"HTTP Error {response.status_code} for {url}. Retries left: {retries - 1}"
                    )
            except RequestException as e:
                print(f"Error downloading {url}: {e}. Retries left: {retries - 1}")
            if not success:
                retries -= 1
                time.sleep(2)  # Delay before retry
        if not success:
            print(f"Failed to download {url} after all retries.")
        time.sleep(0.5)  # Small delay between requests


if __name__ == "__main__":
    main()

View File

@@ -1,65 +0,0 @@
import asyncio
import aiohttp
from aiohttp_socks import ProxyConnector, ProxyType
import os
import urllib.parse
import time
async def download_file(session, url, filename, semaphore):
    """Download *url* to *filename*, retrying up to 3 times.

    Concurrency is bounded by *semaphore*. Returns True on success (or when
    the file already exists), False after all retries fail.
    """
    async with semaphore:
        if os.path.exists(filename):
            print(f"Skipping (unknown)")
            return True
        retries = 3
        while retries > 0:
            try:
                async with session.get(url, timeout=30) as response:
                    if response.status == 200:
                        content = await response.read()
                        # Write raw bytes: avoids any re-encoding of the page.
                        with open(filename, "wb") as f:
                            f.write(content)
                        print(f"Successfully downloaded (unknown)")
                        return True
                    else:
                        print(f"HTTP Error {response.status} for {url}")
            except Exception as e:
                print(f"Error for {url}: {e}")
            retries -= 1
            await asyncio.sleep(2)
        print(f"Failed all retries for {url}")
        return False
async def main():
    """Read missing_urls.txt and download all pages concurrently (max 10),
    routed through a SOCKS5 proxy."""
    if not os.path.exists("html_files"):
        os.makedirs("html_files")
    with open("missing_urls.txt", "r", encoding="utf-8") as f:
        urls = [line.strip() for line in f if line.strip()]
    # To satisfy the instruction: SOCKS5 proxy 127.0.0.1:10808
    # aiohttp_socks uses socks5:// instead of socks5h:// but with rdns=True it is equivalent.
    proxy_url = "socks5://127.0.0.1:10808"
    connector = ProxyConnector.from_url(proxy_url, rdns=True)
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    semaphore = asyncio.Semaphore(10)  # 10 concurrent downloads
    async with aiohttp.ClientSession(connector=connector, headers=headers) as session:
        tasks = []
        for url in urls:
            decoded_url = urllib.parse.unquote(url)
            volume_name = decoded_url.split("/")[-1]
            filename = f"html_files/{volume_name}.html"
            tasks.append(download_file(session, url, filename, semaphore))
        await asyncio.gather(*tasks)


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -1,65 +0,0 @@
import os
import time
import urllib.parse
import requests
from requests.exceptions import RequestException
import concurrent.futures
def download_url(url):
    """Download one Wikisource page to html_files/<volume>.html.

    Returns (True, filename) on success or when the file already exists,
    (False, filename) after 3 failed attempts.

    Fixes vs. original: the trailing duplicated `return False, filename`
    was unreachable dead code, and RequestException was silently swallowed
    (`except ... pass`) — it is now reported so failures are diagnosable.
    """
    proxies = {"http": "http://127.0.0.1:10808", "https": "http://127.0.0.1:10808"}
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    decoded_url = urllib.parse.unquote(url)
    volume_name = decoded_url.split("/")[-1]
    filename = f"html_files/{volume_name}.html"
    if os.path.exists(filename):
        # Already exists
        return True, filename
    # Count retries down so the "Retries left" messages match the original.
    for retries in range(3, 0, -1):
        try:
            response = requests.get(url, headers=headers, proxies=proxies, timeout=15)
            if response.status_code == 200:
                with open(filename, "w", encoding="utf-8") as out_f:
                    out_f.write(response.text)
                print(f"Successfully downloaded (unknown)")
                return True, filename
            print(f"HTTP Error {response.status_code} for {url}. Retries left: {retries - 1}")
        except RequestException as e:
            print(f"Error downloading {url}: {e}. Retries left: {retries - 1}")
        time.sleep(2)
    print(f"Failed to download {url} after all retries.")
    return False, filename
def main():
    """Fan out download_url over missing_urls.txt with 20 worker threads."""
    if not os.path.exists("html_files"):
        os.makedirs("html_files")
    with open("missing_urls.txt", "r", encoding="utf-8") as fh:
        pending = [ln.strip() for ln in fh if ln.strip()]
    # Use ThreadPoolExecutor to download multiple files concurrently
    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as pool:
        jobs = [pool.submit(download_url, target) for target in pending]
        for job in concurrent.futures.as_completed(jobs):
            try:
                job.result()
            except Exception as exc:
                print(f"Exception during download: {exc}")


if __name__ == "__main__":
    main()

View File

@@ -1,79 +0,0 @@
import os
import time
import urllib.parse
import requests
from requests.exceptions import RequestException
import concurrent.futures
def download_url(url):
    """Download one page to html_files/<volume>.html through the local proxy.

    Retries up to 5 times, waiting longer after a 403 to back off from rate
    limiting. Returns (True, filename) on success or when the file already
    exists, (False, filename) after all attempts fail.

    Fixes vs. original: removed the unreachable duplicated
    `return False, filename` and stopped silently swallowing
    RequestException (it is now logged).
    """
    proxies = {"http": "http://127.0.0.1:10808", "https": "http://127.0.0.1:10808"}
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
    }
    decoded_url = urllib.parse.unquote(url)
    volume_name = decoded_url.split("/")[-1]
    filename = f"html_files/{volume_name}.html"
    if os.path.exists(filename):
        return True, filename
    # Count retries down so the "Retries left" messages match the original.
    for retries in range(5, 0, -1):
        try:
            response = requests.get(url, headers=headers, proxies=proxies, timeout=15)
            if response.status_code == 200:
                with open(filename, "w", encoding="utf-8") as out_f:
                    out_f.write(response.text)
                print(f"Successfully downloaded (unknown)")
                return True, filename
            if response.status_code == 403:
                print(
                    f"HTTP Error 403 for {url}. Waiting longer to avoid rate limit..."
                )
                time.sleep(5)
            else:
                print(
                    f"HTTP Error {response.status_code} for {url}. Retries left: {retries - 1}"
                )
        except RequestException as e:
            print(f"Error downloading {url}: {e}. Retries left: {retries - 1}")
        time.sleep(3)
    print(f"Failed to download {url} after all retries.")
    return False, filename
def main():
    """Retry the remaining downloads from still_missing.txt with 5 threads."""
    if not os.path.exists("html_files"):
        os.makedirs("html_files")
    with open("still_missing.txt", "r", encoding="utf-8") as fh:
        pending = [ln.strip() for ln in fh if ln.strip()]
    print(f"Starting download for {len(pending)} remaining files...")
    # Keep concurrency low (5 workers) so the server does not answer with
    # 403 rate-limit responses.
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as pool:
        jobs = [pool.submit(download_url, target) for target in pending]
        for job in concurrent.futures.as_completed(jobs):
            try:
                job.result()
            except Exception as exc:
                print(f"Exception during download: {exc}")


if __name__ == "__main__":
    main()

View File

@@ -1,96 +0,0 @@
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7070%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7073%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7073%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7073%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7074%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7074%E4%B9%8B5
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7077%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7081%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7081%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7081%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7082%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7082%E4%B9%8B5
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7083%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7083%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7083%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7083%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7084%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7084%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7085%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7085%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7085%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7085%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B02
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B03
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B04
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B05
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B06
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B07
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B08
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B09
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B10
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7091%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7091%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7091%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7091%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7092%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7092%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7092%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7093%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7093%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7093%E4%B9%8B6
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7093%E4%B9%8B7
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7093%E4%B9%8B8
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7094%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7094%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7095%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7095%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7095%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7095%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7095%E4%B9%8B5
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7095%E4%B9%8B6
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7096%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7096%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7096%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7098%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7098%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7098%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7098%E4%B9%8B5
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B5
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B7
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B8
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B9
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B010
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B6
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B7
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B8
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B9
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B010
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B011
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B012
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7101%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7101%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7101%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7101%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B5
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B6
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B7
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B8
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7103%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7103%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7104%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7104%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7105%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7105%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7105%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7106%E4%B9%8B2

View File

@@ -1,53 +0,0 @@
import re
from bs4 import BeautifulSoup
with open("html_files/卷001之1.html", "r", encoding="utf-8") as f:
soup = BeautifulSoup(f.read(), "html.parser")
text = soup.find("div", class_="poem").get_text()
# Extract Tone and Rhyme
tone_rhyme_m = re.search(r"(上平聲|下平聲|上聲|去聲|入聲)\s+(.*?韻.*)", text)
if tone_rhyme_m:
print("Tone:", tone_rhyme_m.group(1))
print("Rhyme:", tone_rhyme_m.group(2))
# Extract Volume
vol_m = re.search(r"卷(.)之(.)", text)
if vol_m:
print("Volume:", f"0{vol_m.group(1)}{vol_m.group(2)}" if vol_m.group(1) in '一二三四五六七八九' else f"{vol_m.group(1)}{vol_m.group(2)}")
# Extract chars
lines = text.split('\n')
rhyme_chars = []
for i, line in enumerate(lines):
if tone_rhyme_m.group(2) in line:
# the next line usually has the characters
chars_line = lines[i+1]
rhyme_chars = [c for c in chars_line.replace(' ', ' ').split() if len(c) == 1]
break
print("Chars:", rhyme_chars)
# Now, we want to strip all lines that are headers.
# Actually, headers repeat: "欽定四庫全書", "御定佩文韻府...", "上平聲..."
# We can just filter out these known header lines!
clean_lines = []
for line in lines:
stripped = line.strip()
if not stripped: continue
if stripped == "欽定四庫全書": continue
if stripped.startswith("御定佩文韻府"): continue
if "平聲" in stripped or "上聲" in stripped or "去聲" in stripped or "入聲" in stripped:
if "" in stripped: continue
# Is it the chars line?
if all(c in rhyme_chars or c in '  ' for c in stripped):
continue
clean_lines.append(stripped)
clean_text = "".join(clean_lines)
print("Start of clean text:", clean_text[:100])
tokens = re.findall(r"([^〈〉]*)((?:〈[^〉]+〉)+)", clean_text)
print("First token:", tokens[0])

View File

@@ -1,10 +0,0 @@
from bs4 import BeautifulSoup
with open("html_files/卷001之1.html", "r", encoding="utf-8") as f:
soup = BeautifulSoup(f.read(), "html.parser")
poem_div = soup.find("div", class_="poem")
if poem_div:
for i, p in enumerate(poem_div.find_all("p")[:20]):
print(f"--- P {i} ---")
print(p.text[:100].replace('\n', ' '))

View File

@@ -1,12 +0,0 @@
import re
from bs4 import BeautifulSoup
with open("html_files/卷001之1.html", "r", encoding="utf-8") as f:
soup = BeautifulSoup(f.read(), "html.parser")
poem_div = soup.find("div", class_="poem")
if poem_div:
lines = poem_div.get_text().split("\n")
lines = [line.strip() for line in lines if line.strip()]
for i, line in enumerate(lines[:50]):
print(f"{i}: {line}")

View File

@@ -1,5 +0,0 @@
import re

# Demo: split an entry line into (headword, angle-bracket description) pairs.
text = "對語〈渭北 江東〉〈 平北 安東〉摘句〈力障百川東〉"
# A headword is any run of characters outside 〈〉; its description is the one
# or more consecutive 〈...〉 groups that follow it.
tokens = re.findall(r"([^〈〉]*)((?:〈[^〉]+〉)+)", text)
for i, (word, desc_blocks) in enumerate(tokens):
    print(f"Token {i}: WORD='{word}' DESCS={desc_blocks}")

View File

@@ -1,54 +0,0 @@
import json
import re
# Load the parsed 佩文韻府 data produced earlier by the scraper/parser.
with open('peiwenyunfu.json', 'r', encoding='utf-8') as f:
data = json.load(f)
# Entry-key prefixes that are not part of the quoted word itself.
# NOTE(review): the second entry is an empty string in the recovered source —
# a character was probably lost during extraction; as written it strips
# nothing, so behaviour is unaffected. Confirm against the original file.
prefixes = ["韻藻", ""]


def replace_pipes(content, word):
    """Restore the abbreviated headword inside a quotation.

    In《佩文韻府》quotations the repeated headword is abbreviated with the
    vertical stroke 丨. Each 丨 in *content* is replaced with the matching
    character of *word* (after stripping a known key prefix), cycling through
    the word; a gap of 5+ non-stroke characters is treated as the start of a
    new occurrence and re-aligns the mapping to the word's first character.

    Bug fix: the 丨 literals had been lost from this block (leaving "" in the
    comparison and replace calls), which made the function a no-op.
    """
    clean_word = word
    for p in prefixes:
        if clean_word.startswith(p) and len(clean_word) > len(p):
            clean_word = clean_word[len(p):]
            break  # only strip one prefix
    word_len = len(clean_word)
    if word_len == 0:
        # Nothing to substitute with: simply drop the placeholder strokes.
        return content.replace("丨", "")
    result = []
    pipe_idx = 0
    chars_since_last_pipe = 0
    for char in content:
        if char == "丨":
            if chars_since_last_pipe >= 5:
                # Long gap -> a new occurrence of the word begins here, so
                # restart the mapping at the first character.
                pipe_idx = 0
            result.append(clean_word[pipe_idx % word_len])
            pipe_idx += 1
            chars_since_last_pipe = 0
        else:
            result.append(char)
            chars_since_last_pipe += 1
    return "".join(result)
# Test specific words
# Each pair is (entry headword, quotation with 丨 placeholders) taken from
# the parsed data; the loop prints the original next to the restored text.
test_cases = [
("首陽東", "詩采葑采葑丨丨之丨"),
("馬首東", "左傳欒黶曰吾丨丨欲丨乃歸下軍從之"),
("澗瀍東", "書洛誥我乃卜丨水東丨水西惟洛食我又卜瀍水丨亦惟洛食"),
("日夜東", "丨丨虞集詩絳桃風急丨丨丨王惲詩付與衡漳丨丨丨許有壬詩江水舟"),
("東海東", "樓鑰詩萬里逺在丨丨丨張經詩崑崙之西丨"),
]
for w, c in test_cases:
print(f"Word: {w}")
print(f"Orig: {c}")
print(f"Fix : {replace_pipes(c, w)}")
print("-" * 40)

View File

@@ -1,37 +0,0 @@
import json
# Entry-key prefixes that are not part of the quoted word itself.
# NOTE(review): the "" entry looks like an extraction artefact (lost
# character); it strips nothing, so behaviour is unaffected.
prefixes = ["韻藻", ""]


def replace_pipes_no_reset(content, word):
    """Replace every 丨 placeholder with characters of *word*, strictly in
    order and cycling, with no gap-based re-alignment (contrast with the
    replace_pipes variant that resets after long gaps).

    Bug fix: the 丨 literals had been lost from this block (leaving "" in the
    comparison and replace calls), which made the function a no-op.
    """
    clean_word = word
    for p in prefixes:
        if clean_word.startswith(p) and len(clean_word) > len(p):
            clean_word = clean_word[len(p):]
            break
    word_len = len(clean_word)
    if word_len == 0:
        # Nothing to substitute with: drop the placeholder strokes.
        return content.replace("丨", "")
    result = []
    pipe_idx = 0
    for char in content:
        if char == "丨":
            result.append(clean_word[pipe_idx % word_len])
            pipe_idx += 1
        else:
            result.append(char)
    return "".join(result)
# Sample (headword, quotation) pairs for eyeballing the no-reset variant.
test_cases = [
("首陽東", "詩采葑采葑丨丨之丨"),
("馬首東", "左傳欒黶曰吾丨丨欲丨乃歸下軍從之"),
("澗瀍東", "書洛誥我乃卜丨水東丨水西惟洛食我又卜瀍水丨亦惟洛食"),
("日夜東", "丨丨虞集詩絳桃風急丨丨丨王惲詩付與衡漳丨丨丨許有壬詩江水舟"),
("東海東", "樓鑰詩萬里逺在丨丨丨張經詩崑崙之西丨"),
]
for w, c in test_cases:
print(f"Word: {w}")
print(f"NoRst: {replace_pipes_no_reset(c, w)}")
print("-" * 40)

View File

@@ -1,56 +0,0 @@
import json
import re
# Entry-key prefixes that are not part of the quoted word itself.
# NOTE(review): the "" entry looks like an extraction artefact (lost
# character); it strips nothing, so behaviour is unaffected.
prefixes = ["韻藻", ""]


def replace_pipes_hybrid(content, word):
    """Replace runs of 丨 with the characters of *word*.

    A run whose length is an exact multiple of the (prefix-stripped) word
    length is treated as one or more complete occurrences of the word and
    resets the cycling alignment; any other run continues the running
    character sequence from where the previous run stopped.

    Bug fix: the 丨 literal had been lost from the word_len == 0 branch
    (leaving replace("", "")), which made that branch a no-op.
    """
    clean_word = word
    for p in prefixes:
        if clean_word.startswith(p) and len(clean_word) > len(p):
            clean_word = clean_word[len(p):]
            break
    word_len = len(clean_word)
    if word_len == 0:
        # Nothing to substitute with: drop the placeholder strokes.
        return content.replace("丨", "")
    def repl(match):
        nonlocal pipe_idx
        block = match.group(0)
        block_len = len(block)
        if block_len % word_len == 0:
            # Full word match! Reset alignment.
            pipe_idx = 0
            return clean_word * (block_len // word_len)
        else:
            # Partial word match. Use current sequence.
            res = ""
            for _ in range(block_len):
                res += clean_word[pipe_idx % word_len]
                pipe_idx += 1
            return res
    pipe_idx = 0
    return re.sub(r'丨+', repl, content)
# Sample (headword, quotation) pairs for eyeballing the hybrid variant.
test_cases = [
("首陽東", "詩采葑采葑丨丨之丨"),
("馬首東", "左傳欒黶曰吾丨丨欲丨乃歸下軍從之"),
("澗瀍東", "書洛誥我乃卜丨水東丨水西惟洛食我又卜瀍水丨亦惟洛食"),
("日夜東", "丨丨虞集詩絳桃風急丨丨丨王惲詩付與衡漳丨丨丨許有壬詩江水舟"),
("東海東", "樓鑰詩萬里逺在丨丨丨張經詩崑崙之西丨"),
]
for w, c in test_cases:
print(f"Word: {w}")
print(f"Orig: {c}")
print(f"Hybr: {replace_pipes_hybrid(c, w)}")
print("-" * 40)
# Extra case: a run of three strokes for a three-character word exercises the
# "exact multiple -> reset" path.
test_cases.append(("紫殿東", "時魚躍丨丨丨温庭筠詩一夕丨丨"))
for w, c in test_cases[-1:]:
print(f"Word: {w}")
print(f"Orig: {c}")
print(f"Hybr: {replace_pipes_hybrid(c, w)}")
print("-" * 40)

View File

@@ -1,31 +0,0 @@
import json
from parser import parse_html
def test_parse_html():
"""Smoke-test parse_html on one volume and dump the result for inspection."""
file_path = "html_files/卷001之1.html"
result = parse_html(file_path)
# Save for manual inspection
with open("output.json", "w", encoding="utf-8") as f:
json.dump(result, f, ensure_ascii=False, indent=2)
# Check that it returns a dictionary
assert isinstance(result, dict)
# Let's see what keys are in it
keys = list(result.keys())
print("Keys found:", keys)
if len(keys) > 0:
first_key = keys[0]
# NOTE(review): the next three asserts test for the key "" — characters
# (likely 声/韵/调 field names) appear to have been lost in extraction;
# as written these asserts fail unless an empty-string key exists.
assert "" in result[first_key]
assert "" in result[first_key]
assert "" in result[first_key]
assert "小韵描述" in result[first_key]
assert "词条" in result[first_key]
assert "对语" in result[first_key]
assert "摘句" in result[first_key]
if __name__ == "__main__":
test_parse_html()
print("Tests passed!")

View File

@@ -1,49 +0,0 @@
import json
import re
# Exploratory script: for a few rhymes, check whether the number of 丨
# placeholders in each entry divides evenly by the headword length, and show
# the naive in-order replacement when it does.
with open('peiwenyunfu.json', 'r', encoding='utf-8') as f:
data = json.load(f)
# Analyze a few
count_match = 0
count_mismatch = 0
for rhyme, r_data in list(data.items())[:5]: # Skip metadata, preface
if rhyme in ["metadata", "preface"]:
continue
print(f"\nRhyme: {rhyme}")
# 1. 小韵描述
desc = r_data.get("小韵描述", "")
# NOTE(review): replace("", rhyme) inserts the rhyme between every character
# — the placeholder literal (likely 丨) was lost in extraction; confirm.
desc_fixed = desc.replace("", rhyme)
print(f"Desc original: {desc[:30]}...")
print(f"Desc fixed: {desc_fixed[:30]}...")
# 2. 词条
for word, content in list(r_data.get("词条", {}).items())[:5]:
# NOTE(review): count("") returns len(content)+1, never 0 — the 丨
# literal appears to have been lost here too.
pipe_count = content.count("")
word_len = len(word)
if pipe_count == 0:
continue
print(f"Word: {word} (len {word_len}), pipes: {pipe_count}")
print(f"Original: {content}")
# Test replacing
if pipe_count % word_len == 0:
# We can replace them in groups of word_len
fixed_content = ""
pipe_idx = 0
for char in content:
if char == "":
fixed_content += word[pipe_idx % word_len]
pipe_idx += 1
else:
fixed_content += char
print(f"Fixed: {fixed_content}")
count_match += 1
else:
print("MISMATCH length!")
count_mismatch += 1
print(f"\nMatches: {count_match}, Mismatches: {count_mismatch}")

View File

@@ -1,46 +0,0 @@
import asyncio
from playwright.async_api import async_playwright
import urllib.parse
import sys
async def main():
"""Fetch one URL (argv[1]) with headless Chromium, first through a SOCKS5
proxy and, on failure, directly; print the content length of each attempt."""
url = sys.argv[1]
proxy_server = "socks5://127.0.0.1:10808"
async with async_playwright() as p:
# Try with proxy
browser = await p.chromium.launch(headless=True, proxy={"server": proxy_server})
context = await browser.new_context(
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
)
page = await context.new_page()
try:
print(f"Loading {url} via proxy...")
await page.goto(url, timeout=30000, wait_until="domcontentloaded")
content = await page.content()
print(f"Success! Content length: {len(content)}")
except Exception as e:
print(f"Error via proxy: {e}")
await browser.close()
# Try without proxy
print("Retrying without proxy...")
browser = await p.chromium.launch(headless=True)
context = await browser.new_context(
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
)
page = await context.new_page()
try:
await page.goto(url, timeout=30000, wait_until="domcontentloaded")
content = await page.content()
print(f"Success! Content length: {len(content)}")
except Exception as e2:
print(f"Error without proxy: {e2}")
await browser.close()
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -1,21 +0,0 @@
import json
# Exploratory script: list entries whose 丨-placeholder count is not a
# multiple of the headword length.
with open('peiwenyunfu.json', 'r', encoding='utf-8') as f:
data = json.load(f)
mismatches = []
prefixes_found = set()
for rhyme, r_data in data.items():
if rhyme in ["metadata", "preface"]:
continue
for word, content in r_data.get("词条", {}).items():
# NOTE(review): count("") returns len(content)+1 — the 丨 literal was
# likely lost in extraction; as written the 0-pipe skip never triggers.
pipe_count = content.count("")
if pipe_count == 0:
continue
if pipe_count % len(word) != 0:
mismatches.append((word, pipe_count))
print(f"Total mismatches: {len(mismatches)}")
for w, p in mismatches[:20]:
print(f"{w} (len {len(w)}), pipes: {p}")

View File

@@ -1,13 +0,0 @@
import json
# Exploratory script: print the raw content of a handful of specific entries
# that were problematic in the pipe-replacement experiments.
with open('peiwenyunfu.json', 'r', encoding='utf-8') as f:
data = json.load(f)
for rhyme, r_data in data.items():
if rhyme in ["metadata", "preface"]:
continue
for word, content in r_data.get("词条", {}).items():
if word in ["紫殿東", "少微東", "東海東", "日夜東", "隔西東"]:
print(f"Word: {word}")
print(f"Content: {content}")
print("-" * 40)

View File

@@ -1,18 +0,0 @@
import re
from bs4 import BeautifulSoup
with open("html_files/卷001之1.html", "r", encoding="utf-8") as f:
soup = BeautifulSoup(f.read(), "html.parser")
poem_div = soup.find("div", class_="poem")
text = poem_div.get_text()
# Extract the list of characters.
# It appears after "一東韻一" or similar.
m = re.search(r"(上平聲|下平聲|上聲|去聲|入聲)\s+(.*?韻.*?)\n(.*?)\n", text)
if m:
print("Tone:", m.group(1))
print("Rhyme:", m.group(2))
print("Chars line:", m.group(3))
rhyme_chars = [c for c in m.group(3).replace(' ', ' ').split() if len(c) == 1]
print("Chars:", rhyme_chars)

View File

@@ -1,32 +0,0 @@
import json
import re
# Exploratory script: like the earlier mismatch scan, but strip known key
# prefixes (韻藻 …) from the headword before dividing the placeholder count.
with open('peiwenyunfu.json', 'r', encoding='utf-8') as f:
data = json.load(f)
# NOTE(review): the "" prefix looks like an extraction artefact (lost char).
prefixes = ["韻藻", ""]
mismatches = []
total_pipes = 0
for rhyme, r_data in data.items():
if rhyme in ["metadata", "preface"]:
continue
for word, content in r_data.get("词条", {}).items():
clean_word = word
for p in prefixes:
if clean_word.startswith(p) and len(clean_word) > len(p):
clean_word = clean_word[len(p):]
# NOTE(review): count("") returns len(content)+1 — the 丨 literal was
# likely lost here; as written the 0-pipe skip never triggers.
pipe_count = content.count("")
if pipe_count == 0:
continue
total_pipes += 1
if pipe_count % len(clean_word) != 0:
mismatches.append((word, clean_word, pipe_count, content))
print(f"Total entries with pipes: {total_pipes}")
print(f"Total mismatches after stripping: {len(mismatches)}")
for w, cw, p, c in mismatches[:10]:
print(f"{w} -> {cw} (len {len(cw)}), pipes: {p}")
print(f" {c}")

View File

@@ -1,21 +0,0 @@
import re
from bs4 import BeautifulSoup
with open("html_files/卷001之1.html", "r", encoding="utf-8") as f:
soup = BeautifulSoup(f.read(), "html.parser")
text = soup.find("div", class_="poem").get_text().replace('\n', '')
# Remove header junk for this test (just find the first '東〈')
start_idx = text.find('東〈')
text = text[start_idx:]
# Tokenize into pairs of (Word, Description)
# Using regex to find all Word〈Description〉
# Wait, multiple 〈 〉 can follow a word like 對語〈...〉〈...〉
# We can find all chunks of non-〈 characters, followed by one or more 〈...〉
tokens = re.findall(r"([^〈〉]*)((?:〈[^〉]+〉)+)", text)
for i, (word, desc_blocks) in enumerate(tokens[:20]):
print(f"Token {i}: WORD='{word}' DESCS={desc_blocks[:30]}...")

View File

@@ -0,0 +1,93 @@
# 类书分类与特点梳理
为辅助专业诗歌创作小程序提供“知识库”,整理现有类书数据的优缺点、特点及在辅助创作中的核心价值如下:
## 一、词条式
这类类书通常以主题(如天文地理、自然物候等)分类,适合根据意象(如“雪”、“月”)进行正向检索找语料。
- **《白孔六帖》**
- **内容与编排顺序**:按天文地理、历法礼仪、生活物件等类目编排。
- **特点**:采辑各种典籍中的成语、典故、短语,多为四字短句,并附带简短的释义。
- **整理问题**:提取很不干净,正文与释义括号混合在一起且缺乏断句。
- **辅助创作价值**可作为提炼“四字骈语”或冷僻典故的原始素材库但需大量NLP清洗。
- **典型例子**`"content": "白髙眀柔克(髙明天也柔克寒暑不干)隂隲下民(言天黙定下民之命)天尊(地卑)..."`
- **《北堂书钞》**
- **内容与编排顺序**:按帝王、后妃、政术、刑法、官职、礼仪等社会制度与名物编排。
- **特点**:成书于隋,引用起自三代、汉、魏,迄于宋、齐。侧重于对概念的追本溯源,对词条解释较为详细。
- **整理问题**JSON提取的正文部分缺乏标点长句粘连。
- **辅助创作价值**:古朴凝练。如创作者需要引用较为正统的经史概念(如写咏史诗),该书能提供最原汁原味的早期语料。
- **典型例子**`"content": "皇者天人之總稱 帝者天號 正氣爲帝 帝者天下之所適王者天下之所徃也..."`
- **《初学记》**
- **内容与编排顺序**:按天、岁时、官职、地理等编排。每个词条下细分为“叙事”、“事对”、“诗文”。
- **特点**:词条内容非常详细、层次分明,具有极强的结构化特征。
- **整理问题**整理得非常干净JSON层级保留了原始分类结构。
- **辅助创作价值****价值极高**。“事对”直接提供了现成的对仗词汇(写诗利器);“诗文”栏目则方便查阅前人咏此物的范本。
- **典型例子**`"事对": "轉葢 倚杵(桓譚新論天如葢轉左旋... 覆盆 轉轂(王充論衡曰..."`
- **《海录碎事》**
- **内容与编排顺序**:按非常细碎的关键词(天、地、衣冠等)分类。
- **特点**:每一类下词条过细(常为生僻两字词),每一词条下内容极少,通常只有一两句包含该词的引文。
- **整理问题**:词条过度碎片化,键名就是细碎词汇。
- **辅助创作价值**:相当于一个“逆向用词示例库”。诗人想用某个生僻意象时,用它查看古人如何将其嵌入诗句中。
- **典型例子**`"曽穹": [{"content": "蹀足循廣除瞬目矖曽穹(文選謝惠連詩)"}]`
- **《骈字类编》**
- **内容与编排顺序**:按天地、时令、山水、珍宝、器物等词汇大类编排。
- **特点**:专收“骈语”(双音节词),词条极多,详细列出了该词在各路经史子集中出现的位置。
- **整理问题**:长段引文粘连,缺少现代标点。
- **辅助创作价值****价值极高**。古诗词创作最核心的就是对“双字词汇”的拿捏,此书就是一个庞大且天然的古典双字词语境库。
- **典型例子**`"天地": "易干夫大人者与天地合其德 又坤天地变化草木蕃天地闭贤人隐..."`
- **《太平御览》**
- **内容与编排顺序**:以天、地、人、事、物为大类顺序。
- **特点**:在前代《修文殿御览》《艺文类聚》等书基础上编纂而成,包罗万象,词条内容全部为原文引文。
- **整理问题**:带有原书排版格式(换行、`《书名》曰`),阅读体验较佳。
- **辅助创作价值**:提供最详实的事物背景知识,适合在需要了解某个意象(如“雪”)的全面历史文化背景时使用。
- **典型例子**`"《三五曆記》曰:未有天地之時,混沌狀如雞子,溟涬始牙..."`
- **《艺文类聚》**
- **内容与编排顺序**:按天、岁时、地理、帝王、人、乐、职官等编排。
- **特点**:事文交织。词条收录较广,既有经史中的“叙事”,也有大量的历代【诗】、【赋】、【赞】。
- **整理问题**:带有原书的标点和分段标记,格式清晰。
- **辅助创作价值**:极好的文学创作资料库,帮助创作者一站式看到某个主题在古代诗文中的各种形态。
- **典型例子**`"【詩】晉傅玄《兩儀詩》曰:兩儀始分.元氣上清.列宿垂象.六位時成..."`
- **《玉海》**
- **内容与编排顺序**:详分天文、地理、典章制度等。
- **特点**:词条常为长篇大论,注重典章制度、天文地理的详细考证。
- **整理问题**:存在词条名拆分或提取不精准的问题(如把“中宫二十八舍”拆断处理)。正文缺乏标点。
- **辅助创作价值**:提供精准宏大的制度与天文星象知识,适合创作偏严肃或庙堂题材的诗歌。
- **典型例子**`"中宫": "漢天文志(史天官書同)中宫天極星其一明者泰一之常居也旁三星三公..."`
- **《渊鉴类函》**
- **内容与编排顺序**:按大部类排布。
- **特点**:清代集大成之作,将引文明确区分为“原”(原类书已有)与“增”(清代新增)。
- **整理问题**:使用空格作为句读分隔,未见全角标点。
- **辅助创作价值**:覆盖面最广的兜底宝库,适合查阅各种意象的演变和最全面的引文集合。
- **典型例子**`"原釋名曰天坦也坦然髙而逺也 增又曰天顯也在上髙顯也..."`
## 二、韵式
这类类书专为押韵而生,以“韵母”或“韵字”为一级分类,适合在写格律诗卡壳、需要找特定韵脚词汇时使用。
- **《佩文韵府》**
- **内容与编排顺序**:按平水韵分类(如“一东”),下系以该字为尾的各种词条及摘句。
- **特点**:非常详细,包含声调、韵部说明,以及海量的带出处短句。以元代《韵府群玉》和明代《五车韵瑞》为基础增补。
- **整理问题**JSON结构层次非常清晰。使用“丨”符号代替原韵字如“東”被替换为“丨”
- **辅助创作价值****写诗必备神器**。想用“东”韵时,能瞬间获得大量以东结尾的词汇(如“南东”、“活东”)及例句,极大辅助押韵。
- **典型例子**`"(韵母)东": { "小韵描述": "东德红切眷方也...", "词条": { "活東": "爾雅科斗丨丨蝦蟇也...", "牆東": "後漢書避世丨丨王君公..." } }`
- **《韵府群玉》**
- **内容与编排顺序**:按大韵分类,列出小韵和具体词条。
- **特点**:早期的韵书,条目较为简练紧凑。
- **整理问题**:条目内容被尖括号`〈〉`包裹,夹杂部分注音(如“徳紅切”)。
- **辅助创作价值**:与佩文韵府同理,但体量更小,适合快速查阅核心的传统押韵典故。
- **典型例子**`"東": { "道東": "〈漢鄭𤣥事馬融辭歸融曰吾道東矣本〉" }`
- **《五车韵瑞》**
- **内容与编排顺序**/
- **特点**/
- **整理问题****严重问题**,当前文件夹内的 `allorigins.json` 数据获取失败,内容实际上是 Nginx 的 `500 Internal Server Error` 报错网页代码并非JSON数据。
- **辅助创作价值**:暂时无价值。需要修复爬虫和数据源。
- **典型例子**`<html><head><title>500 Internal Server Error</title></head><body>...`

View File

@@ -1,161 +0,0 @@
import bs4
import os
import re
import json
html_dir = "epub_extracted/OPS"
def parse_html(filepath):
"""Parse one EPUB xhtml volume into an ordered event stream.

Returns a list of ("section", title) / ("text", fragment) tuples taken from
the <p> children of every div.poem, preserving document order so callers can
re-segment the text by section headings.
"""
with open(filepath, "r", encoding="utf-8") as f:
soup = bs4.BeautifulSoup(f.read(), "html.parser")
poem_divs = soup.find_all("div", class_="poem")
events = []
for div in poem_divs:
for p in div.find_all("p"):
for child in p.children:
if child.name == "br":
pass # ignore br, we want continuous text
# NOTE(review): endswith("") is always True, so any id-bearing span is
# treated as a section heading — a character (perhaps 韻 or 部) appears
# to have been lost from this literal; confirm against the original.
elif child.name == "span" and child.get("id") and child.text.strip().endswith(""):
events.append(("section", child.text.strip()))
elif child.name == "small":
small_text = child.get_text()
# NOTE(review): both startswith("") checks are always True, so this
# branch is dead and the else always runs — the bracket literals
# (likely 〈 / 〉) were probably lost in extraction.
if not small_text.startswith("") and not small_text.startswith(""):
events.append(("text", f"{small_text}"))
else:
events.append(("text", small_text))
elif isinstance(child, str):
# Bare NavigableString between tags: keep it as running text.
events.append(("text", child))
else:
# Any other inline tag: keep only its visible text.
events.append(("text", child.get_text()))
return events
def extract_sections(content_text):
    """Split an entry body into its three parts.

    Returns a dict with keys 叙事 (narrative), 事对 (paired allusions) and
    诗文 (literary quotations). The 事对 part starts at the 事對/事对 marker;
    the 诗文 part starts at the first genre tag (賦, 詩, …) that directly
    follows a closing 〉 bracket, which is kept with the preceding part.
    """
    parts = {"叙事": "", "事对": "", "诗文": ""}
    remainder = content_text
    # Locate the 事對 marker, trying the traditional spelling first.
    marker_pos = remainder.find("事對")
    if marker_pos == -1:
        marker_pos = remainder.find("事对")
    genre_pattern = r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛|讃|啟|辭|辞|操|對|对)"
    if marker_pos == -1:
        # No 事對 section: everything up to the first genre tag is narrative.
        genre_hit = re.search(genre_pattern, remainder)
        if genre_hit:
            cut = genre_hit.start() + 1  # keep the closing 〉 with 叙事
            parts["叙事"] = remainder[:cut]
            parts["诗文"] = remainder[cut:]
        else:
            parts["叙事"] = remainder
    else:
        parts["叙事"] = remainder[:marker_pos]
        tail = remainder[marker_pos + 2 :]  # skip the two-character marker
        genre_hit = re.search(genre_pattern, tail)
        if genre_hit:
            cut = genre_hit.start() + 1  # keep the closing 〉 with 事对
            parts["事对"] = tail[:cut]
            parts["诗文"] = tail[cut:]
        else:
            parts["事对"] = tail
    for key in parts:
        # NOTE(review): both replace() targets are empty strings in the
        # recovered source (characters, likely 〈 and 〉, lost in extraction);
        # the calls are no-ops and are kept verbatim for fidelity.
        parts[key] = parts[key].replace("", "").replace("", "").strip()
    return parts
def main():
    """Parse all 30 volumes and write the aggregated 初学记.json.

    Each volume's event stream is merged into (section, text) runs, every
    run is split into entries at 〈叙事〉/〈敘事〉 headers, and each entry's
    content is divided into 叙事/事对/诗文 by extract_sections().
    """
    categories = {}  # entry name -> list of per-volume occurrence dicts
    total_entries_found = 0
    for vol in range(1, 31):
        # Locate this volume's XHTML file by its juanNN filename suffix.
        filename = None
        for fn in os.listdir(html_dir):
            if fn.endswith(f"juan{vol:02d}.xhtml"):
                filename = fn
                break
        if not filename:
            print(f"Volume {vol} not found")
            continue
        events = parse_html(os.path.join(html_dir, filename))
        # Collapse the event stream into (section, concatenated text) runs:
        # each "section" event closes the previous run and opens a new one.
        merged = []
        current_section = ""
        current_text = []
        for ev_type, val in events:
            if ev_type == "section":
                if current_text:
                    merged.append((current_section, "".join(current_text)))
                    current_text = []
                current_section = val
            else:
                current_text.append(val)
        if current_text:
            merged.append((current_section, "".join(current_text)))
        for sec, txt in merged:
            # Entry headers look like "<name>第N〈叙事〉", optionally preceded
            # by a boundary character (〉, ), ideographic space, whitespace).
            matches = list(re.finditer(r"(?:^|[〉)\u3000\s])([^〈〉<>\s]+?)(?:第[一二三四五六七八九十百]+(?:[上下])?)?〈(?:叙事|敘事)〉", txt))
            if not matches:
                continue
            for i, m in enumerate(matches):
                entry_name = m.group(1).strip()
                start_idx = m.end()  # content begins right after 〈叙事〉
                # Slice up to the NEXT header's group(1) (the entry name),
                # not match.start(): the optional boundary char before the
                # next name — typically the 〉 closing the current entry's
                # final quotation — belongs to the CURRENT entry's text and
                # must stay inside this slice.
                end_idx = matches[i+1].start(1) if i + 1 < len(matches) else len(txt)
                content_text = txt[start_idx:end_idx]
                sections = extract_sections(content_text)
                if entry_name not in categories:
                    categories[entry_name] = []
                categories[entry_name].append({
                    "volume": vol,
                    "section": sec,
                    "content": sections
                })
                total_entries_found += 1
    final_json = {
        "metadata": {
            "title": "初学记",
            "author": "徐坚",
            "dynasty": "",
            "total_volumes": 30,
            "source": "2026年1月28日从维基文库导出"
        },
        "preface": "",
        "categories": categories
    }
    with open("初学记.json", "w", encoding="utf-8") as f:
        json.dump(final_json, f, ensure_ascii=False, indent=2)
    print(f"Generated 初学记.json with {len(categories)} unique categories and {total_entries_found} total entries.")


if __name__ == "__main__":
    main()

View File

@@ -1,24 +0,0 @@
import bs4
import re
import sys
def parse_html(filepath):
    """Debug helper: print every section heading found in one XHTML volume.

    Walks the direct children of each <p> inside a div.poem and reports
    <span> elements that carry an id and whose stripped text ends with the
    expected suffix. Output goes to stdout; nothing is returned.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        soup = bs4.BeautifulSoup(f.read(), "html.parser")
    current_section = None
    poem_divs = soup.find_all("div", class_="poem")
    for div in poem_divs:
        # Before we process children, let's look at p tags
        for p in div.find_all("p"):
            for child in p.children:
                if child.name == "span" and child.get("id") and child.text.strip().endswith(""):
                    current_section = child.text.strip()
                    print("Found Section:", current_section)
                elif child.name == "br":
                    pass
                elif isinstance(child, bs4.element.NavigableString):
                    # Idiom fix: isinstance() instead of type() == — this also
                    # matches NavigableString subclasses (e.g. Comment); both
                    # branches are no-ops, so behavior is unchanged.
                    pass
parse_html("epub_extracted/OPS/c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml")

View File

@@ -1,54 +0,0 @@
import bs4
import re
import os
html_dir = "epub_extracted/OPS"
def parse_html(filepath):
    """Parse one XHTML volume into ("section" | "line", text) events.

    <br> tags terminate the current line; section headings are <span>
    elements with an id whose stripped text ends with the expected suffix;
    <small> commentary is bracket-wrapped before joining the running line.
    """
    with open(filepath, "r", encoding="utf-8") as fh:
        soup = bs4.BeautifulSoup(fh.read(), "html.parser")
    events = []
    for poem in soup.find_all("div", class_="poem"):
        for para in poem.find_all("p"):
            buffer = []

            def flush():
                # Emit the accumulated line, if any, and reset the buffer.
                if buffer:
                    events.append(("line", "".join(buffer).strip()))
                    buffer.clear()

            for node in para.children:
                name = node.name
                if name == "br":
                    flush()
                elif name == "span" and node.get("id") and node.text.strip().endswith(""):
                    events.append(("section", node.text.strip()))
                elif name == "small":
                    small_text = node.get_text()
                    # Wrap commentary unless it already carries its own markers.
                    if small_text.startswith("") or small_text.startswith(""):
                        buffer.append(small_text)
                    else:
                        buffer.append(f"{small_text}")
                else:
                    buffer.append(node.get_text())
            flush()
    return events
# Smoke test: parse juan 01 and dump the first 30 events for inspection.
events = parse_html(
    os.path.join(html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml")
)
for e in events[:30]:
    print(e)

View File

@@ -1,127 +0,0 @@
import bs4
import os
import re
html_dir = "epub_extracted/OPS"
def parse_html(filepath):
    """Parse one XHTML volume into ("section" | "line", text) events.

    Lines are accumulated across a <p>'s children and flushed on <br>;
    section headings are id-bearing <span> elements; <small> commentary is
    bracket-wrapped before joining the running line.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        soup = bs4.BeautifulSoup(f.read(), "html.parser")
    poem_divs = soup.find_all("div", class_="poem")
    events = []
    for div in poem_divs:
        for p in div.find_all("p"):
            current_line = []
            for child in p.children:
                if child.name == "br":
                    # <br> closes the current line (only if non-empty).
                    if current_line:
                        events.append(("line", "".join(current_line).strip()))
                        current_line = []
                elif (
                    child.name == "span"
                    and child.get("id")
                    and child.text.strip().endswith("")
                ):
                    events.append(("section", child.text.strip()))
                elif child.name == "small":
                    small_text = child.get_text()
                    # Wrap commentary unless it already carries its own markers.
                    if not small_text.startswith("") and not small_text.startswith(
                        ""
                    ):
                        current_line.append(f"{small_text}")
                    else:
                        current_line.append(small_text)
                else:
                    current_line.append(child.get_text())
            # Flush whatever remains after the last <br>.
            if current_line:
                events.append(("line", "".join(current_line).strip()))
    return events
def extract_sections(entry_text):
    """Split one entry's text into its 叙事 / 事对 / 诗文 parts.

    The entry must contain a 〈叙事〉/〈敘事〉 marker; everything after it is
    examined. The 事对 part begins at the first "事對"/"事对" token; the
    诗文 part begins at the first "〉<genre>" boundary (the closing 〉 stays
    with the part before it). Returns a dict keyed "叙事"/"事对"/"诗文";
    absent parts come back as "".
    """
    result = {"叙事": "", "事对": "", "诗文": ""}
    narrative_match = re.search(r"〈(叙事|敘事)〉(.*)", entry_text)
    if not narrative_match:
        return result
    rest_after_narrative = narrative_match.group(2)
    shidui_start = rest_after_narrative.find("事對")
    if shidui_start == -1:
        shidui_start = rest_after_narrative.find("事对")
    # Hoisted: this pattern was previously duplicated in both branches below.
    genre_pattern = r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛|讃|啟)"
    if shidui_start != -1:
        result["叙事"] = rest_after_narrative[:shidui_start]
        rest_after_shidui = rest_after_narrative[shidui_start + 2 :]  # skip 事對 token
        shiwen_match = re.search(genre_pattern, rest_after_shidui)
        if shiwen_match:
            split_idx = shiwen_match.start() + 1  # keep the closing 〉 with 事对
            result["事对"] = rest_after_shidui[:split_idx]
            result["诗文"] = rest_after_shidui[split_idx:]
        else:
            result["事对"] = rest_after_shidui
    else:
        shiwen_match = re.search(genre_pattern, rest_after_narrative)
        if shiwen_match:
            split_idx = shiwen_match.start() + 1
            result["叙事"] = rest_after_narrative[:split_idx]
            result["诗文"] = rest_after_narrative[split_idx:]
        else:
            result["叙事"] = rest_after_narrative
    for k in result:
        # Bug fix: this cleanup previously called replace("", "") (a no-op)
        # and never removed the 〈〉 annotation markers it was meant to strip.
        result[k] = result[k].replace("〈", "").replace("〉", "")
    return result
# Driver: split juan 01's line events into entries at 〈叙事〉/〈敘事〉 markers
# and print a preview of the first three.
events = parse_html(
    os.path.join(html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml")
)
entries = []
current_entry_text = []  # lines belonging to the entry currently being built
for ev_type, text in events:
    if ev_type == "section":
        # Section headings are not part of any entry's text.
        pass
    else:
        if "〈叙事〉" in text or "〈敘事〉" in text:
            # A new entry starts here: finalize the one accumulated so far.
            if current_entry_text:
                full_text = "".join(current_entry_text)
                # Header shape: "<name>第N〈叙事〉" (ordinal part optional).
                match = re.search(
                    r"^(.*?)(?:第[一二三四五六七八九十百]+(?:[上下])?)?〈(?:叙事|敘事)〉",
                    full_text,
                )
                if match:
                    entry_name = match.group(1).strip()
                    sections = extract_sections(full_text)
                    entries.append((entry_name, sections))
            current_entry_text = [text]
        else:
            current_entry_text.append(text)
# Flush the final entry left in the accumulator.
if current_entry_text:
    full_text = "".join(current_entry_text)
    match = re.search(
        r"^(.*?)(?:第[一二三四五六七八九十百]+(?:[上下])?)?〈(?:叙事|敘事)〉", full_text
    )
    if match:
        entry_name = match.group(1).strip()
        sections = extract_sections(full_text)
        entries.append((entry_name, sections))
for entry_name, sections in entries[:3]:
    print("Entry:", entry_name)
    print("叙事:", sections["叙事"][:50])
    print("事对:", sections["事对"][:50])
    print("诗文:", sections["诗文"][:50])
    print("-" * 20)

View File

@@ -1,35 +0,0 @@
import bs4
import os
import json
import re
html_dir = "epub_extracted/OPS"
def parse_html(filepath):
    """Flatten one XHTML volume into a list of text lines, split on <br>."""
    with open(filepath, "r", encoding="utf-8") as fh:
        soup = bs4.BeautifulSoup(fh.read(), "html.parser")
    lines = []
    for poem in soup.find_all("div", class_="poem"):
        for para in poem.find_all("p"):
            pieces = []
            for node in para.children:
                if node.name != "br":
                    pieces.append(node.get_text())
                    continue
                # NB: a <br> always flushes, even when the line is empty.
                lines.append("".join(pieces).strip())
                pieces = []
            if pieces:
                lines.append("".join(pieces).strip())
    return lines
# Smoke test: parse juan 01 and print the first 50 extracted lines.
texts = parse_html(
    os.path.join(html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml")
)
for i, t in enumerate(texts[:50]):
    print(f"{i}: {t}")

View File

@@ -1,93 +0,0 @@
import bs4
import os
import json
import re
html_dir = "epub_extracted/OPS"
def parse_html(filepath):
    """Flatten one XHTML volume into a list of text lines, split on <br>.

    Collects the text of every child of each <p> inside a div.poem; a <br>
    terminates the current line (even an empty one gets appended).
    """
    with open(filepath, "r", encoding="utf-8") as f:
        soup = bs4.BeautifulSoup(f.read(), "html.parser")
    poem_divs = soup.find_all("div", class_="poem")
    texts = []
    for div in poem_divs:
        for p in div.find_all("p"):
            current_line = []
            for child in p.children:
                if child.name == "br":
                    # Flush unconditionally — an empty line is still recorded.
                    texts.append("".join(current_line).strip())
                    current_line = []
                else:
                    current_line.append(child.get_text())
            # Flush whatever remains after the last <br>.
            if current_line:
                texts.append("".join(current_line).strip())
    return texts
def extract_sections(entry_text):
    """Split one entry's text into its 叙事 / 事对 / 诗文 parts.

    The entry must contain a 〈叙事〉 marker; everything after it is
    examined. The 事对 part begins at the first "事對"/"事对" token; the
    诗文 part begins at the first "〉<genre>" boundary (the closing 〉 stays
    with the part before it). Returns a dict keyed "叙事"/"事对"/"诗文";
    absent parts come back as "". No marker cleanup is performed here.
    """
    result = {"叙事": "", "事对": "", "诗文": ""}
    # Extract 叙事
    narrative_match = re.search(r"〈叙事〉(.*)", entry_text)
    if not narrative_match:
        return result
    rest_after_narrative = narrative_match.group(1)
    # Find 事对
    shidui_start = rest_after_narrative.find("事對")
    if shidui_start == -1:
        shidui_start = rest_after_narrative.find("事对")
    # Hoisted: this pattern was previously duplicated in both branches below.
    # It matches 〉 followed by a literary genre, marking where 诗文 starts.
    genre_pattern = r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛)"
    if shidui_start != -1:
        result["叙事"] = rest_after_narrative[:shidui_start]
        rest_after_shidui = rest_after_narrative[shidui_start + 2 :]  # skip "事對"
        shiwen_match = re.search(genre_pattern, rest_after_shidui)
        if shiwen_match:
            split_idx = shiwen_match.start() + 1  # keep the genre character
            result["事对"] = rest_after_shidui[:split_idx]
            result["诗文"] = rest_after_shidui[split_idx:]
        else:
            result["事对"] = rest_after_shidui
    else:
        # No 事对
        shiwen_match = re.search(genre_pattern, rest_after_narrative)
        if shiwen_match:
            split_idx = shiwen_match.start() + 1
            result["叙事"] = rest_after_narrative[:split_idx]
            result["诗文"] = rest_after_narrative[split_idx:]
        else:
            result["叙事"] = rest_after_narrative
    return result
# Driver: scan juan 01's flattened lines for entry headers and preview the
# section lengths of the first three parsed entries.
texts = parse_html(
    os.path.join(html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml")
)
entries = []
for line in texts:
    if "〈叙事〉" in line:
        entry_name = line.split("〈叙事〉")[0]
        # remove "第X" from entry_name
        word_name = re.sub(
            r"第[一二三四五六七八九十百]+(?:[上下])?$", "", entry_name
        ).strip()
        sections = extract_sections(line)
        entries.append((word_name, entry_name, sections))
for e in entries[:3]:
    print(f"Word: {e[0]}")
    print(f"Entry: {e[1]}")
    print(f"叙事 len: {len(e[2]['叙事'])}")
    print(f"事对 len: {len(e[2]['事对'])}")
    print(f"诗文 len: {len(e[2]['诗文'])}")
    print("-" * 20)

View File

@@ -1,41 +0,0 @@
import re
def extract_sections(content_text):
    """Split one entry's body into its 叙事 / 事对 / 诗文 parts.

    *content_text* is assumed to start right after the 〈叙事〉 marker.
    The 事对 part begins at the first "事對"/"事对" token; the 诗文 part
    begins at the first "〉<genre>" boundary (the closing 〉 stays with the
    part before it). Returns a dict keyed "叙事"/"事对"/"诗文"; absent
    parts come back as "".
    """
    result = {"叙事": "", "事对": "", "诗文": ""}
    rest_after_narrative = content_text
    shidui_start = rest_after_narrative.find("事對")
    if shidui_start == -1:
        shidui_start = rest_after_narrative.find("事对")
    # Fix: dropped the duplicated 銘 and 論 alternatives that appeared twice
    # in this alternation (duplicates can never change what matches).
    genre_pattern = r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛|讃|啟|辭|辞|操|對|对)"
    if shidui_start != -1:
        result["叙事"] = rest_after_narrative[:shidui_start]
        rest_after_shidui = rest_after_narrative[shidui_start + 2 :]  # skip 事對 token
        shiwen_match = re.search(genre_pattern, rest_after_shidui)
        if shiwen_match:
            split_idx = shiwen_match.start() + 1  # keep the closing 〉 with 事对
            result["事对"] = rest_after_shidui[:split_idx]
            result["诗文"] = rest_after_shidui[split_idx:]
        else:
            result["事对"] = rest_after_shidui
    else:
        shiwen_match = re.search(genre_pattern, rest_after_narrative)
        if shiwen_match:
            split_idx = shiwen_match.start() + 1
            result["叙事"] = rest_after_narrative[:split_idx]
            result["诗文"] = rest_after_narrative[split_idx:]
        else:
            result["叙事"] = rest_after_narrative
    for k in result:
        # Bug fix: this cleanup previously called replace("", "") (a no-op)
        # and never removed the 〈〉 annotation markers it was meant to strip.
        result[k] = result[k].replace("〈", "").replace("〉", "")
    return result
import json

# Manual spot-check of the splitter on a synthetic entry string.
print(extract_sections("這里是叙事事對這裡是事對〉詩這里是詩文"))