Update: 删除垃圾程序

This commit is contained in:
denglifan
2026-03-22 16:43:10 +08:00
parent 183b842090
commit 2881163106
41 changed files with 93 additions and 2046 deletions

View File

@@ -1,8 +0,0 @@
import json

# Quick inspection script: print the first 20 headword entries ("词条")
# recorded under the rhyme character "𩅰" in peiwenyunfu.json.
# NOTE(review): source indentation was lost in transit; restored here.
with open('peiwenyunfu.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

siai_citiao = data.get("𩅰", {}).get("词条", {})
for k, v in list(siai_citiao.items())[:20]:
    # Show only the first 30 characters of each entry body.
    print(f"{k}: {v[:30]}...")

View File

@@ -1,7 +0,0 @@
import json

# Dump the full record for a handful of rhyme keys from the v2 JSON.
# NOTE(review): indentation was lost in transit and restored here; three
# keys in the list below were also lost (they appear as empty strings) —
# restore them before running, otherwise `data.get('')` just prints {}.
with open('peiwenyunfu_v2.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

for k in ['𩅰', '', '', '']:
    print(f"Rhyme: {k}")
    print(json.dumps(data.get(k, {}), ensure_ascii=False, indent=2))

View File

@@ -1,71 +0,0 @@
import os
import time
import urllib.request
import urllib.parse
from urllib.error import HTTPError
def main():
    """Download every Wikisource page listed in still_missing.txt into
    html_files/, retrying up to 5 times per URL and backing off on 403.

    All traffic is routed through the local HTTP proxy at 127.0.0.1:10808.
    NOTE(review): source indentation was lost in transit; restored here.
    """
    if not os.path.exists("html_files"):
        os.makedirs("html_files")
    with open("still_missing.txt", "r", encoding="utf-8") as f:
        urls = [line.strip() for line in f if line.strip()]
    # Install the proxy opener globally so plain urlopen() uses it.
    proxy_support = urllib.request.ProxyHandler({'http': 'http://127.0.0.1:10808', 'https': 'http://127.0.0.1:10808'})
    opener = urllib.request.build_opener(proxy_support)
    urllib.request.install_opener(opener)
    # Browser-like headers: Wikisource rejects bare-bones clients with 403.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Referer': 'https://zh.wikisource.org/'
    }
    count = 0
    total = len(urls)
    for url in urls:
        # The last path segment (percent-decoded) is the volume title.
        vol = urllib.parse.unquote(url).split('/')[-1]
        filename = f'html_files/{vol}.html'
        if os.path.exists(filename):
            print(f"[{count+1}/{total}] Skipping {vol} (exists)")
            count += 1
            continue
        print(f"[{count+1}/{total}] Downloading {vol}...")
        success = False
        for attempt in range(5):
            try:
                req = urllib.request.Request(url, headers=headers)
                # Close the response explicitly (the original leaked the
                # handle by never closing it).
                with urllib.request.urlopen(req, timeout=15) as response:
                    html = response.read().decode('utf-8')
                with open(filename, 'w', encoding='utf-8') as out_f:
                    out_f.write(html)
                print(f" -> Success!")
                success = True
                break
            except HTTPError as e:
                if e.code == 403:
                    # Rate limited: wait longer before the next attempt.
                    print(f" -> 403 Forbidden. Waiting 5 seconds...")
                    time.sleep(5)
                else:
                    print(f" -> HTTP Error {e.code}")
            except Exception as e:
                print(f" -> Error: {e}")
            time.sleep(2)
        if not success:
            print(f" -> Failed all attempts.")
        count += 1
        time.sleep(1)  # Be nice to server


if __name__ == "__main__":
    main()

View File

@@ -1,110 +0,0 @@
import urllib.request
from bs4 import BeautifulSoup
import urllib.parse
import re
import os
url = "https://api.allorigins.win/raw?url=https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)"
def chinese_to_arabic(cn_str):
    """Convert a Chinese numeral string (e.g. "一百零六") to an int.

    Handles 零/〇, the digits 一..九, and the units 十/百/千 — enough for
    the volume numbers (1-106) used by this project.

    NOTE(review): the numeral keys in the table below were garbled (empty
    strings) in the copy under review; restored to the standard characters
    implied by the mapped values — confirm against the original script.
    """
    cn_num = {
        "〇": 0, "零": 0, "一": 1, "二": 2, "三": 3, "四": 4,
        "五": 5, "六": 6, "七": 7, "八": 8, "九": 9, "十": 10,
        "百": 100, "千": 1000,
    }
    result = 0
    temp = 0
    for char in cn_str:
        if char in ["百", "千"]:
            # A bare unit (e.g. "百") counts as one of that unit.
            if temp == 0:
                temp = 1
            result += temp * cn_num[char]
            temp = 0
        elif char == "十":
            if temp == 0:
                temp = 1
            if len(cn_str) == 1:
                return 10
            elif result == 0 and temp == 1 and cn_str[0] == "十":
                # Leading "十" as in "十五" -> 15.
                result += 10
                temp = 0
            else:
                result += temp * cn_num[char]
                temp = 0
        else:
            # Plain digit: remember it until the next unit character.
            temp = cn_num.get(char, 0)
    result += temp
    return result
def get_filename(vol_str):
    """Map a volume title like "卷七十之4" to a local HTML filename.

    Both the major and sub-volume parts are run through chinese_to_arabic;
    the major number is zero-padded to three digits. Titles that do not
    start with "卷" fall through unchanged (plus the .html suffix).

    NOTE(review): source indentation was lost in transit; restored here.
    """
    m = re.match(r"卷(.+?)之(.+)", vol_str)
    if m:
        v1 = chinese_to_arabic(m.group(1))
        v2 = chinese_to_arabic(m.group(2))
        return f"{v1:03d}{v2}.html"
    m = re.match(r"卷(.+)", vol_str)
    if m:
        v1 = chinese_to_arabic(m.group(1))
        return f"{v1:03d}.html"
    return vol_str + ".html"
def main():
    """Scrape the Wikisource index page and write the URLs of per-volume
    pages that have no corresponding local file to missing_urls.txt.

    NOTE(review): source indentation was lost in transit; restored here.
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=30) as response:
            html = response.read().decode("utf-8")
    except Exception as e:
        print("Failed to fetch:", e)
        # If the fetch failed, `html` stays unbound and the NameError
        # handler below aborts cleanly instead of crashing.
        pass
    try:
        soup = BeautifulSoup(html, "html.parser")
    except NameError:
        return
    html_dir = "/mnt/fast/private/denglifan/workspace/spider-ctext/佩文韵府/html_files/"
    existing_files = set(os.listdir(html_dir)) if os.path.exists(html_dir) else set()
    missing_urls = []
    seen_urls = set()
    base_url = "https://zh.wikisource.org"
    for a in soup.find_all("a"):
        href = a.get("href")
        if not href:
            continue
        unquoted_href = urllib.parse.unquote(href)
        # Only keep per-volume links (卷XXX之Y); skip 全览 aggregate pages.
        if "御定佩文韻府" in unquoted_href and "四庫全書本" in unquoted_href and "/卷" in unquoted_href:
            title_part = unquoted_href.split("/")[-1]
            if "全覽" in title_part or "全览" in title_part:
                continue
            # Require the "卷X之Y" shape explicitly.
            if not re.match(r"卷.+?之.+", title_part):
                continue
            full_url = urllib.parse.urljoin(base_url, href)
            full_url = full_url.split('#')[0]  # drop fragment anchors
            if full_url in seen_urls:
                continue
            seen_urls.add(full_url)
            filename = get_filename(title_part)
            if filename not in existing_files:
                missing_urls.append(full_url)
    with open("missing_urls.txt", "w", encoding="utf-8") as f:
        for u in missing_urls:
            f.write(u + "\n")
    print(f"Found {len(missing_urls)} missing URLs.")


if __name__ == "__main__":
    main()

View File

@@ -1,14 +0,0 @@
import json

# One-off cleanup: in every rhyme's 词条 (headword) entries, expand the
# placeholder character to the rhyme character itself.
# NOTE(review): source indentation was lost in transit and restored here.
# The placeholder literal was also lost (empty string); restored to "丨"
# based on the companion script that substitutes runs of 丨 — confirm
# before running (replace("", x) would corrupt every string).
with open('peiwenyunfu.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

for rhyme, r_data in data.items():
    if rhyme in ["metadata", "preface"]:
        continue
    if "词条" in r_data:
        for word, content in r_data["词条"].items():
            if "丨" in content:
                r_data["词条"][word] = content.replace("丨", rhyme)

with open('peiwenyunfu.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

View File

@@ -1,69 +0,0 @@
import json
import re

# Expand runs of the placeholder 丨 in entry bodies to the characters of
# the entry's own headword, keeping character alignment across runs.
# NOTE(review): source indentation was lost in transit; restored here.
print("Loading peiwenyunfu.json...")
with open('peiwenyunfu.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Section prefixes to strip from a headword before substitution.
# NOTE(review): two entries were lost (empty strings) in the copy under
# review; as empty strings they are harmless no-ops, but should be
# restored from the original script.
prefixes = ["韻藻", "", ""]


def clean_headword(word):
    """Strip known section prefixes (e.g. 韻藻) from a headword."""
    clean_word = word
    for _ in range(2):  # two passes for stacked prefixes like "増韻藻"
        for p in prefixes:
            if clean_word.startswith(p) and len(clean_word) > len(p):
                clean_word = clean_word[len(p):]
    return clean_word


def replace_pipes_in_content(content, word):
    """Replace runs of 丨 in `content` with characters of the cleaned
    headword, carrying alignment over between partial runs."""
    clean_word = clean_headword(word)
    word_len = len(clean_word)
    if word_len == 0 or "丨" not in content:
        return content

    def repl(match):
        nonlocal pipe_idx
        block = match.group(0)
        block_len = len(block)
        if block_len % word_len == 0:
            # Whole-word run: reset alignment and repeat the word.
            pipe_idx = 0
            return clean_word * (block_len // word_len)
        else:
            # Partial run: continue from the current offset in the word.
            res = ""
            for _ in range(block_len):
                res += clean_word[pipe_idx % word_len]
                pipe_idx += 1
            return res

    pipe_idx = 0
    return re.sub(r'丨+', repl, content)


print("Processing...")
for rhyme, r_data in data.items():
    if rhyme in ["metadata", "preface"]:
        continue
    # 1. Fix 小韵描述: here the placeholder stands for the rhyme character
    #    itself (the dictionary key), so a plain replace suffices.
    if "小韵描述" in r_data and r_data["小韵描述"]:
        # NOTE(review): the placeholder literal here was lost (empty
        # string); restored to "丨" to match the rest of this script —
        # confirm before running.
        r_data["小韵描述"] = r_data["小韵描述"].replace("丨", rhyme)
    # 2. Fix 词条 entry bodies via the alignment-aware substitution.
    if "词条" in r_data:
        new_citiao = {}
        for word, content in r_data["词条"].items():
            new_citiao[word] = replace_pipes_in_content(content, word)
        r_data["词条"] = new_citiao

print("Saving peiwenyunfu.json...")
with open('peiwenyunfu.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)
print("Done!")

View File

@@ -1,24 +0,0 @@
import json

# Expand the placeholder to the rhyme character in the 对语 / 摘句 sections
# and in the 词条 headword keys themselves.
# NOTE(review): source indentation was lost in transit and restored here.
# The placeholder literal was also lost (empty string); restored to "丨"
# based on the companion pipe-substitution script — confirm before running.
print("Loading...")
with open('peiwenyunfu.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

for rhyme, r_data in data.items():
    if rhyme in ["metadata", "preface"]:
        continue
    if "对语" in r_data and r_data["对语"]:
        r_data["对语"] = r_data["对语"].replace("丨", rhyme)
    if "摘句" in r_data and r_data["摘句"]:
        r_data["摘句"] = r_data["摘句"].replace("丨", rhyme)
    if "词条" in r_data:
        # Rebuild the dict because the keys themselves may change.
        new_citiao = {}
        for word, content in r_data["词条"].items():
            new_word = word.replace("丨", rhyme)
            new_citiao[new_word] = content
        r_data["词条"] = new_citiao

with open('peiwenyunfu.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)
print("Done!")

View File

@@ -1,57 +0,0 @@
import json

# Restructure peiwenyunfu.json: split each top-level rhyme's flat 词条 dict
# into per-character records (keyed by the small-rhyme head character) with
# a nested 韵藻 dict, writing the result to peiwenyunfu_v2.json.
# NOTE(review): source indentation was lost in transit; restored here.
# Several CJK literals were also lost (empty strings).  The record keys are
# restored to 卷/声/韵 to match the parser that produced the data; the two
# garbled prefix entries and the garbled entries in the dictionary-source
# list below could NOT be recovered — the empty strings in that list make
# the `any(...)` test always true for 1-character keys, so restore them
# before running.
print("Loading...")
with open('peiwenyunfu.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

new_data = {}
new_data['metadata'] = data['metadata']
new_data['preface'] = data['preface']

prefixes = ["韻藻", "韵藻", "", ""]


def clean_key(k):
    """Repeatedly strip known section prefixes from a headword key."""
    changed = True
    while changed:
        changed = False
        for p in prefixes:
            if k.startswith(p) and len(k) > len(p):
                k = k[len(p):]
                changed = True
    return k


for rhyme, r_data in data.items():
    if rhyme in ['metadata', 'preface']:
        continue
    # Seed a record for the main rhyme character itself.
    new_data[rhyme] = {
        "卷": r_data["卷"],
        "声": r_data["声"],
        "韵": r_data["韵"],
        "小韵描述": r_data["小韵描述"],
        "韵藻": {},
        "对语": r_data.get("对语", ""),
        "摘句": r_data.get("摘句", "")
    }
    current_rhyme = rhyme
    for k, v in r_data.get("词条", {}).items():
        # A single-character key whose description opens with a dictionary
        # citation (説文/廣韻/...) starts a new small-rhyme character.
        if len(k) == 1 and any(x in v[:15] for x in ['', '説文', '廣韻', '玉篇', '集韻', '韻㑹', '', '', '釋名', '爾雅']):
            current_rhyme = k
            new_data[current_rhyme] = {
                "卷": r_data["卷"],
                "声": r_data["声"],
                "韵": r_data["韵"],
                "小韵描述": k + v,
                "韵藻": {},
                "对语": "",
                "摘句": ""
            }
        else:
            cleaned = clean_key(k)
            new_data[current_rhyme]["韵藻"][cleaned] = v

with open('peiwenyunfu_v2.json', 'w', encoding='utf-8') as f:
    json.dump(new_data, f, ensure_ascii=False, indent=4)
print(f"Old size: {len(data)}, New size: {len(new_data)}")

View File

@@ -1,28 +0,0 @@
import json

# Forward-fill missing 卷 (volume) fields: rhymes parsed from continuation
# pages carry no volume, so propagate the last non-empty value forward in
# dictionary order.
# NOTE(review): source indentation was lost in transit and restored here.
# The volume key literal was also lost (empty string); restored to "卷"
# based on the `vol` variable name and the parser's output keys — confirm
# against the data before running.
print("Loading peiwenyunfu.json...")
with open('peiwenyunfu.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

current_vol = ""
fixed_count = 0
for rhyme, r_data in data.items():
    if rhyme in ['metadata', 'preface']:
        continue
    vol = r_data.get("卷", "")
    if vol.strip():
        # Remember the last valid volume seen.
        current_vol = vol.strip()
    else:
        # Empty volume: inherit the current one.
        if current_vol:
            r_data["卷"] = current_vol
            fixed_count += 1

print(f"Fixed {fixed_count} missing volumes.")
with open('peiwenyunfu.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)
print("Saved to peiwenyunfu.json!")

View File

@@ -1,83 +0,0 @@
import os
import glob
import json
import re
from parser import parse_html
def natural_sort_key(s):
    """Sort key for volume filenames like "卷001之1.html".

    Returns (1, volume, sub_volume) for matching names, and (2, 0, 0) as a
    fallback so unrecognized names sort after all volumes.

    NOTE(review): source indentation was lost in transit; restored here.
    """
    basename = os.path.basename(s)
    # The downloaded filenames already use Arabic digits, e.g.
    # "卷001之1.html" or "卷106之4.html"; "之N" is optional.
    m = re.search(r"卷(\d+)(?:之(\d+))?", basename)
    if m:
        vol = int(m.group(1))
        sub = int(m.group(2)) if m.group(2) else 0
        return (1, vol, sub)
    # fallback
    return (2, 0, 0)
def main():
    """Parse every downloaded volume page and merge the per-rhyme results
    into peiwenyunfu.json under a "content" key, with a metadata header.

    NOTE(review): source indentation was lost in transit; restored here.
    The 卷/声/韵 key literals were also lost (empty strings); restored to
    match the parser's output keys — confirm before running.
    """
    # Only pick files that start with "卷" to skip 全覽 aggregate pages.
    files = glob.glob('html_files/卷*.html')
    files.sort(key=natural_sort_key)
    print(f"Starting to parse {len(files)} files...")
    combined_result = {}
    success_count = 0
    fail_count = 0
    for idx, fpath in enumerate(files):
        try:
            res = parse_html(fpath)
            for k, v in res.items():
                # Drop the "(韵母)" marker the parser prefixes to keys.
                clean_key = k.replace("(韵母)", "")
                if clean_key not in combined_result:
                    combined_result[clean_key] = v
                else:
                    # Same rhyme spans several files: merge the sections.
                    combined_result[clean_key]["词条"].update(v.get("词条", {}))
                    if "对语" in v and v["对语"]:
                        combined_result[clean_key]["对语"] += v["对语"]
                    if "摘句" in v and v["摘句"]:
                        combined_result[clean_key]["摘句"] += v["摘句"]
                    # Backfill 卷/声/韵 if the first file lacked them.
                    if not combined_result[clean_key]["卷"] and v["卷"]:
                        combined_result[clean_key]["卷"] = v["卷"]
                    if not combined_result[clean_key]["声"] and v["声"]:
                        combined_result[clean_key]["声"] = v["声"]
                    if not combined_result[clean_key]["韵"] and v["韵"]:
                        combined_result[clean_key]["韵"] = v["韵"]
            success_count += 1
            if idx % 50 == 0:
                print(f"Parsed {idx}/{len(files)} files...")
        except Exception as e:
            print(f"Failed to parse {fpath}: {e}")
            fail_count += 1
    print(f"Parsing finished. Success: {success_count}, Failed: {fail_count}")
    print(f"Total unique rhyme characters extracted: {len(combined_result)}")
    # Construct final output with metadata.
    # NOTE(review): the "dynasty" value was lost (empty string) in the copy
    # under review; left as-is.
    final_output = {
        "metadata": {
            "title": "御定佩文韵府",
            "author": "张玉书等",
            "dynasty": "",
            "total_volumes": 106,
            "source": "2026年3月22日从维基文库导出"
        },
        "preface": "",
        "content": combined_result
    }
    with open('peiwenyunfu.json', 'w', encoding='utf-8') as f:
        json.dump(final_output, f, ensure_ascii=False, indent=4)
    print("Saved output to peiwenyunfu.json")


if __name__ == "__main__":
    main()

View File

@@ -1,79 +0,0 @@
import os
import glob
import json
import re
from parser import parse_html
def natural_sort_key(s):
    """Ordering key for filenames like "卷001之1.html": (1, volume, sub)
    for matching names, (2, 0, 0) so non-matching names sort last.

    NOTE(review): source indentation was lost in transit; restored here.
    """
    match = re.search(r"卷(\d+)(?:之(\d+))?", os.path.basename(s))
    if not match:
        return (2, 0, 0)
    volume = int(match.group(1))
    part = match.group(2)
    return (1, volume, int(part) if part else 0)
def main():
    """Parse every downloaded volume page and merge the per-rhyme results
    into peiwenyunfu.json, with the rhyme characters at the top level
    (alongside "metadata" and "preface").

    NOTE(review): source indentation was lost in transit; restored here.
    The 卷/声/韵 key literals were also lost (empty strings); restored to
    match the parser's output keys — confirm before running.
    """
    files = glob.glob('html_files/卷*.html')
    files.sort(key=natural_sort_key)
    print(f"Starting to parse {len(files)} files...")
    combined_result = {}
    success_count = 0
    fail_count = 0
    for idx, fpath in enumerate(files):
        try:
            res = parse_html(fpath)
            for k, v in res.items():
                # Drop the "(韵母)" marker the parser prefixes to keys.
                clean_key = k.replace("(韵母)", "")
                if clean_key not in combined_result:
                    combined_result[clean_key] = v
                else:
                    # Same rhyme spans several files: merge the sections.
                    combined_result[clean_key]["词条"].update(v.get("词条", {}))
                    if "对语" in v and v["对语"]:
                        combined_result[clean_key]["对语"] += v["对语"]
                    if "摘句" in v and v["摘句"]:
                        combined_result[clean_key]["摘句"] += v["摘句"]
                    # Backfill 卷/声/韵 if the first file lacked them.
                    if not combined_result[clean_key]["卷"] and v["卷"]:
                        combined_result[clean_key]["卷"] = v["卷"]
                    if not combined_result[clean_key]["声"] and v["声"]:
                        combined_result[clean_key]["声"] = v["声"]
                    if not combined_result[clean_key]["韵"] and v["韵"]:
                        combined_result[clean_key]["韵"] = v["韵"]
            success_count += 1
            if idx % 50 == 0:
                print(f"Parsed {idx}/{len(files)} files...")
        except Exception as e:
            print(f"Failed to parse {fpath}: {e}")
            fail_count += 1
    print(f"Parsing finished. Success: {success_count}, Failed: {fail_count}")
    print(f"Total unique rhyme characters extracted: {len(combined_result)}")
    # NOTE(review): the "dynasty" value was lost (empty string); left as-is.
    final_output = {
        "metadata": {
            "title": "御定佩文韵府",
            "author": "张玉书等奉敕编",
            "dynasty": "",
            "total_volumes": 106,
            "source": "2026年3月22日从维基文库导出"
        },
        "preface": ""
    }
    final_output.update(combined_result)
    with open('peiwenyunfu.json', 'w', encoding='utf-8') as f:
        json.dump(final_output, f, ensure_ascii=False, indent=4)
    print("Saved output to peiwenyunfu.json")


if __name__ == "__main__":
    main()

View File

@@ -1,9 +0,0 @@
def simplify_char(c):
    """Map a traditional character to its simplified form; identity for
    anything not in the table.

    NOTE(review): the mapping pairs were lost (empty strings) in the copy
    under review, so this is currently an identity function — restore the
    character pairs before relying on it.
    """
    mapping = {'': '', '': '', '': ''}  # add others if needed
    return mapping.get(c, c)


# Arabic-digit equivalents for Chinese numerals.
# NOTE(review): the numeral keys were lost (empty strings) in transit;
# restore 一..九 before use.
num_map = {'': '1', '': '2', '': '3', '': '4', '': '5', '': '6', '': '7', '': '8', '': '9'}
# Intended usage for a volume string like "01之一":
# vol_m = re.search(r"卷(.)之(.)", text)
# "0" + num_map[vol_m.group(1)] + "之" + vol_m.group(2)

View File

@@ -1,124 +0,0 @@
import re
from bs4 import BeautifulSoup
def simplify(text):
    """Convert traditional characters in `text` to simplified ones via a
    small substitution table.

    NOTE(review): source indentation was lost in transit and restored here.
    All character pairs in the table were also lost (empty strings), which
    makes this function a no-op at present — restore the pairs before
    relying on its output.
    """
    mapping = {
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
        '': '',
    }
    for k, v in mapping.items():
        text = text.replace(k, v)
    return text
def parse_html(file_path):
    """Parse one downloaded Wikisource volume page.

    Returns a dict keyed by "(韵母)<char>" with volume/tone/rhyme info, the
    small-rhyme description, and the 词条 / 对语 / 摘句 sections.

    NOTE(review): source indentation was lost in transit and restored here.
    Several CJK literals were also lost (empty strings).  The ones whose
    value is unambiguous from context (the numeral table, the 〈〉 bracket
    strip, the 卷/声/韵 result keys) have been restored; the remaining
    empty-string literals are flagged inline and must be re-checked against
    the original script before running.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")
    poem_div = soup.find("div", class_="poem")
    if not poem_div:
        return {}
    text = poem_div.get_text()
    # Extract tone and rhyme from the header line, e.g. "上平聲 一東韻".
    tone_rhyme_m = re.search(r"(上平聲|下平聲|上聲|去聲|入聲)\s+(.*?韻.*)", text)
    current_tone = simplify(tone_rhyme_m.group(1)) if tone_rhyme_m else ""
    if tone_rhyme_m:
        raw_rhyme = tone_rhyme_m.group(2).strip()
        m_rhyme = re.match(r"(.*?)[韻]", raw_rhyme)
        # NOTE(review): the literal stripped in the fallback was lost
        # (empty string) — presumably "韻"; confirm before running.
        current_rhyme = simplify(m_rhyme.group(1)) if m_rhyme else simplify(raw_rhyme.replace('', ''))
    else:
        current_rhyme = ""
    # Extract the volume number, e.g. "卷十之一".
    vol_m = re.search(r"卷([一二三四五六七八九十]+)之(.+)", text)
    if vol_m:
        # NOTE(review): numeral keys restored from the mapped values and
        # the regex character class above.
        num_map = {'一': '1', '二': '2', '三': '3', '四': '4', '五': '5', '六': '6', '七': '7', '八': '8', '九': '9', '十': '10'}
        v1 = vol_m.group(1)
        v1_digit = "".join(num_map.get(c, c) for c in v1)
        if len(v1_digit) == 1:
            v1_str = f"0{v1_digit}"
        else:
            v1_str = v1_digit
        # The match after '之' may capture trailing text, e.g. '一\n'.
        v2 = vol_m.group(2).split('\n')[0].strip()
        # NOTE(review): possibly a "之" literal was lost from this f-string;
        # left as-is — confirm against the original.
        current_vol = f"{v1_str}{v2}"
    else:
        current_vol = ""
    # Collect the rhyme head characters listed after the header line.
    lines = text.split('\n')
    rhyme_chars = []
    for i, line in enumerate(lines):
        if tone_rhyme_m and tone_rhyme_m.group(2) in line:
            chars_line = lines[i+1]
            rhyme_chars = [c for c in chars_line.replace(' ', ' ').split() if len(c) == 1]
            break
    # Drop boilerplate lines before joining the text back together.
    clean_lines = []
    for line in lines:
        stripped = line.strip()
        if not stripped:
            continue
        if stripped == "欽定四庫全書":
            continue
        if stripped.startswith("御定佩文韻府"):
            continue
        if "平聲" in stripped or "上聲" in stripped or "去聲" in stripped or "入聲" in stripped:
            # NOTE(review): the literal tested below was lost (empty
            # string), which makes this test always true — restore the
            # intended character before running.
            if "" in stripped:
                continue
        if all(c in rhyme_chars or c in '  ' for c in stripped):
            continue
        clean_lines.append(stripped)
    clean_text = "".join(clean_lines)
    # Each entry is "<headword>〈description〉..." in the cleaned text.
    tokens = re.findall(r"([^〈〉]*)((?:〈[^〉]+〉)+)", clean_text)
    result = {}
    current_char = None
    for word, desc_blocks in tokens:
        word = word.strip()
        # NOTE(review): restored '〈'/'〉' as the characters stripped from
        # the description blocks (matches the token regex above).
        desc_content = desc_blocks.replace('〈', '').replace('〉', '')
        # Is this a main (small-rhyme head) character definition?
        if word in rhyme_chars:
            current_char = word
            simplified_char = simplify(current_char)
            key = f"(韵母){simplified_char}"
            if key not in result:
                result[key] = {
                    "卷": current_vol,
                    "声": current_tone,
                    "韵": current_rhyme,
                    "小韵描述": simplify(current_char + desc_content),
                    "词条": {},
                    "对语": "",
                    "摘句": ""
                }
        elif word == "對語" or word == "对语":
            if current_char:
                key = f"(韵母){simplify(current_char)}"
                # Could be multiple parts, usually one per character block.
                result[key]["对语"] += desc_content
        elif word == "摘句":
            if current_char:
                key = f"(韵母){simplify(current_char)}"
                result[key]["摘句"] += desc_content
        else:
            # Ordinary 词条 entry under the current head character.
            if current_char and word:
                key = f"(韵母){simplify(current_char)}"
                result[key]["词条"][word] = desc_content
    return result

View File

@@ -1,65 +0,0 @@
import os
import time
import urllib.parse
import requests
from requests.exceptions import RequestException
def main():
    """Download each URL in missing_urls.txt to html_files/ through the
    local HTTP proxy, with up to 3 retries per file.

    NOTE(review): source indentation was lost in transit; restored here.
    The progress messages read "(unknown)" in the copy under review — the
    {volume_name} placeholders have been restored as the likely original.
    """
    if not os.path.exists("html_files"):
        os.makedirs("html_files")
    with open("missing_urls.txt", "r", encoding="utf-8") as f:
        urls = [line.strip() for line in f if line.strip()]
    proxies = {"http": "http://127.0.0.1:10808", "https": "http://127.0.0.1:10808"}
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    for url in urls:
        # Decode the URL to get the original characters.
        decoded_url = urllib.parse.unquote(url)
        # Extract the volume name, e.g., 卷001之1.
        volume_name = decoded_url.split("/")[-1]
        filename = f"html_files/{volume_name}.html"
        if os.path.exists(filename):
            print(f"Skipping {volume_name}, already exists.")
            continue
        print(f"Downloading {volume_name} from {url}...")
        success = False
        retries = 3
        while not success and retries > 0:
            try:
                response = requests.get(
                    url, headers=headers, proxies=proxies, timeout=15
                )
                if response.status_code == 200:
                    with open(filename, "w", encoding="utf-8") as out_f:
                        out_f.write(response.text)
                    success = True
                    print(f"Successfully downloaded {volume_name}")
                else:
                    print(
                        f"HTTP Error {response.status_code} for {url}. Retries left: {retries - 1}"
                    )
            except RequestException as e:
                print(f"Error downloading {url}: {e}. Retries left: {retries - 1}")
            if not success:
                retries -= 1
                time.sleep(2)  # Delay before retry
        if not success:
            print(f"Failed to download {url} after all retries.")
        time.sleep(0.5)  # Small delay between requests


if __name__ == "__main__":
    main()

View File

@@ -1,65 +0,0 @@
import asyncio
import aiohttp
from aiohttp_socks import ProxyConnector, ProxyType
import os
import urllib.parse
import time
async def download_file(session, url, filename, semaphore):
    """Download one URL to `filename`, gated by `semaphore`, with up to
    3 retries.  Returns True on success or when the file already exists.

    NOTE(review): source indentation was lost in transit; restored here.
    Progress messages read "(unknown)" in the copy under review — restored
    to {filename} as the likely original placeholder.
    """
    async with semaphore:
        if os.path.exists(filename):
            print(f"Skipping {filename}")
            return True
        retries = 3
        while retries > 0:
            try:
                async with session.get(url, timeout=30) as response:
                    if response.status == 200:
                        content = await response.read()
                        with open(filename, "wb") as f:
                            f.write(content)
                        print(f"Successfully downloaded {filename}")
                        return True
                    else:
                        print(f"HTTP Error {response.status} for {url}")
            except Exception as e:
                print(f"Error for {url}: {e}")
            retries -= 1
            await asyncio.sleep(2)  # back off before the next attempt
        print(f"Failed all retries for {url}")
        return False
async def main():
    """Queue every URL in missing_urls.txt for download through the local
    SOCKS5 proxy, at most 10 concurrently.

    NOTE(review): source indentation was lost in transit; restored here.
    """
    if not os.path.exists("html_files"):
        os.makedirs("html_files")
    with open("missing_urls.txt", "r", encoding="utf-8") as f:
        urls = [line.strip() for line in f if line.strip()]
    # SOCKS5 proxy at 127.0.0.1:10808.  aiohttp_socks uses socks5:// rather
    # than socks5h://, but rdns=True makes DNS resolve through the proxy,
    # which is equivalent.
    proxy_url = "socks5://127.0.0.1:10808"
    connector = ProxyConnector.from_url(proxy_url, rdns=True)
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    semaphore = asyncio.Semaphore(10)  # 10 concurrent downloads
    async with aiohttp.ClientSession(connector=connector, headers=headers) as session:
        tasks = []
        for url in urls:
            decoded_url = urllib.parse.unquote(url)
            volume_name = decoded_url.split("/")[-1]
            filename = f"html_files/{volume_name}.html"
            tasks.append(download_file(session, url, filename, semaphore))
        await asyncio.gather(*tasks)


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -1,65 +0,0 @@
import os
import time
import urllib.parse
import requests
from requests.exceptions import RequestException
import concurrent.futures
def download_url(url):
    """Download a single volume page to html_files/, retrying up to 3 times.

    Returns (True, filename) when the file exists or was downloaded, and
    (False, filename) after all retries failed.

    NOTE(review): source indentation was lost in transit; restored here.
    The success message read "(unknown)" — restored to {volume_name}.
    """
    proxies = {"http": "http://127.0.0.1:10808", "https": "http://127.0.0.1:10808"}
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    decoded_url = urllib.parse.unquote(url)
    volume_name = decoded_url.split("/")[-1]
    filename = f"html_files/{volume_name}.html"
    if os.path.exists(filename):
        # Already exists
        return True, filename
    success = False
    retries = 3
    while not success and retries > 0:
        try:
            response = requests.get(url, headers=headers, proxies=proxies, timeout=15)
            if response.status_code == 200:
                with open(filename, "w", encoding="utf-8") as out_f:
                    out_f.write(response.text)
                success = True
                print(f"Successfully downloaded {volume_name}")
                return True, filename
            else:
                print(f"HTTP Error {response.status_code} for {url}. Retries left: {retries - 1}")
        except RequestException as e:
            # Was a silent `pass`; surface the error so failures are visible.
            print(f"Request error for {url}: {e}. Retries left: {retries - 1}")
        if not success:
            retries -= 1
            time.sleep(2)
    if not success:
        print(f"Failed to download {url} after all retries.")
        return False, filename
    return False, filename  # unreachable: the success path returns above
def main():
    """Download all URLs from missing_urls.txt using 20 worker threads.

    NOTE(review): source indentation was lost in transit; restored here.
    """
    if not os.path.exists("html_files"):
        os.makedirs("html_files")
    with open("missing_urls.txt", "r", encoding="utf-8") as f:
        urls = [line.strip() for line in f if line.strip()]
    # Use ThreadPoolExecutor to download multiple files concurrently.
    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
        futures = [executor.submit(download_url, url) for url in urls]
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()
            except Exception as e:
                print(f"Exception during download: {e}")


if __name__ == "__main__":
    main()

View File

@@ -1,79 +0,0 @@
import os
import time
import urllib.parse
import requests
from requests.exceptions import RequestException
import concurrent.futures
def download_url(url):
    """Download one volume page with 5 retries and a longer back-off on
    HTTP 403 (rate limiting).

    Returns (True, filename) when the file exists or was downloaded, and
    (False, filename) after all retries failed.

    NOTE(review): source indentation was lost in transit; restored here.
    The success message read "(unknown)" — restored to {volume_name}.
    """
    proxies = {"http": "http://127.0.0.1:10808", "https": "http://127.0.0.1:10808"}
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
    }
    decoded_url = urllib.parse.unquote(url)
    volume_name = decoded_url.split("/")[-1]
    filename = f"html_files/{volume_name}.html"
    if os.path.exists(filename):
        return True, filename
    success = False
    retries = 5
    while not success and retries > 0:
        try:
            response = requests.get(url, headers=headers, proxies=proxies, timeout=15)
            if response.status_code == 200:
                with open(filename, "w", encoding="utf-8") as out_f:
                    out_f.write(response.text)
                success = True
                print(f"Successfully downloaded {volume_name}")
                return True, filename
            elif response.status_code == 403:
                print(
                    f"HTTP Error 403 for {url}. Waiting longer to avoid rate limit..."
                )
                time.sleep(5)
            else:
                print(
                    f"HTTP Error {response.status_code} for {url}. Retries left: {retries - 1}"
                )
        except RequestException as e:
            # Was a silent `pass`; surface the error so failures are visible.
            print(f"Request error for {url}: {e}. Retries left: {retries - 1}")
        if not success:
            retries -= 1
            time.sleep(3)
    if not success:
        print(f"Failed to download {url} after all retries.")
        return False, filename
    return False, filename  # unreachable: the success path returns above
def main():
    """Retry the still-missing volumes listed in still_missing.txt with a
    small thread pool (5 workers, to stay under the 403 rate limit).

    NOTE(review): source indentation was lost in transit; restored here.
    """
    if not os.path.exists("html_files"):
        os.makedirs("html_files")
    with open("still_missing.txt", "r", encoding="utf-8") as f:
        urls = [line.strip() for line in f if line.strip()]
    print(f"Starting download for {len(urls)} remaining files...")
    # Use ThreadPoolExecutor to download multiple files concurrently.
    # Reduced max_workers to 5 to avoid triggering 403 Forbidden rate limits.
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        futures = [executor.submit(download_url, url) for url in urls]
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()
            except Exception as e:
                print(f"Exception during download: {e}")


if __name__ == "__main__":
    main()

View File

@@ -1,96 +0,0 @@
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7070%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7073%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7073%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7073%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7074%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7074%E4%B9%8B5
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7077%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7081%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7081%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7081%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7082%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7082%E4%B9%8B5
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7083%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7083%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7083%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7083%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7084%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7084%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7085%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7085%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7085%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7085%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B02
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B03
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B04
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B05
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B06
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B07
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B08
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B09
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B10
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7091%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7091%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7091%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7091%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7092%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7092%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7092%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7093%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7093%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7093%E4%B9%8B6
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7093%E4%B9%8B7
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7093%E4%B9%8B8
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7094%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7094%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7095%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7095%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7095%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7095%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7095%E4%B9%8B5
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7095%E4%B9%8B6
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7096%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7096%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7096%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7098%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7098%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7098%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7098%E4%B9%8B5
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B5
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B7
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B8
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B9
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B010
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B6
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B7
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B8
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B9
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B010
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B011
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B012
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7101%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7101%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7101%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7101%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B4
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B5
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B6
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B7
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B8
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7103%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7103%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7104%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7104%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7105%E4%B9%8B1
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7105%E4%B9%8B2
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7105%E4%B9%8B3
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7106%E4%B9%8B2

View File

@@ -1,53 +0,0 @@
# Exploratory parser for one scraped wikisource volume: pull the tone/rhyme
# heading, the list of rhyme characters, strip repeated page headers, and
# tokenize the remaining text into word/annotation pairs.
import re
from bs4 import BeautifulSoup
with open("html_files/卷001之1.html", "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f.read(), "html.parser")
# All volume text lives in a single div with class "poem".
text = soup.find("div", class_="poem").get_text()
# Extract Tone and Rhyme
tone_rhyme_m = re.search(r"(上平聲|下平聲|上聲|去聲|入聲)\s+(.*?韻.*)", text)
if tone_rhyme_m:
    print("Tone:", tone_rhyme_m.group(1))
    print("Rhyme:", tone_rhyme_m.group(2))
# Extract Volume
vol_m = re.search(r"卷(.)之(.)", text)
if vol_m:
    # Zero-pad single-character (Chinese-numeral) volume numbers.
    print("Volume:", f"0{vol_m.group(1)}{vol_m.group(2)}" if vol_m.group(1) in '一二三四五六七八九' else f"{vol_m.group(1)}{vol_m.group(2)}")
# Extract chars
lines = text.split('\n')
rhyme_chars = []
# NOTE(review): tone_rhyme_m is used unguarded below — if the tone/rhyme
# regex above did not match, this loop raises AttributeError.
for i, line in enumerate(lines):
    if tone_rhyme_m.group(2) in line:
        # the next line usually has the characters
        chars_line = lines[i+1]
        # NOTE(review): both replace() arguments render as a space here —
        # presumably full-width space -> ASCII space; confirm the bytes.
        rhyme_chars = [c for c in chars_line.replace(' ', ' ').split() if len(c) == 1]
        break
print("Chars:", rhyme_chars)
# Now, we want to strip all lines that are headers.
# Actually, headers repeat: "欽定四庫全書", "御定佩文韻府...", "上平聲..."
# We can just filter out these known header lines!
clean_lines = []
for line in lines:
    stripped = line.strip()
    if not stripped: continue
    if stripped == "欽定四庫全書": continue
    if stripped.startswith("御定佩文韻府"): continue
    if "平聲" in stripped or "上聲" in stripped or "去聲" in stripped or "入聲" in stripped:
        # NOTE(review): '"" in stripped' is always True, so every line that
        # reaches this branch is skipped; the literal looks garbled in
        # transfer (likely a specific character such as 韻) — confirm.
        if "" in stripped: continue
    # Is it the chars line?
    if all(c in rhyme_chars or c in '  ' for c in stripped):
        continue
    clean_lines.append(stripped)
clean_text = "".join(clean_lines)
print("Start of clean text:", clean_text[:100])
# Pair each headword with the run of 〈...〉 annotation blocks following it.
tokens = re.findall(r"([^〈〉]*)((?:〈[^〉]+〉)+)", clean_text)
print("First token:", tokens[0])

View File

@@ -1,10 +0,0 @@
from bs4 import BeautifulSoup

# Peek at the first twenty <p> elements inside div.poem to learn the page
# structure before writing the real parser.
with open("html_files/卷001之1.html", "r", encoding="utf-8") as page:
    markup = page.read()
soup = BeautifulSoup(markup, "html.parser")
poem_div = soup.find("div", class_="poem")
if poem_div:
    for i, para in enumerate(poem_div.find_all("p")[:20]):
        print(f"--- P {i} ---")
        print(para.text[:100].replace('\n', ' '))

View File

@@ -1,12 +0,0 @@
import re
from bs4 import BeautifulSoup

# Print the first fifty non-empty lines of the poem container, numbered, so
# the header/body layout can be inspected by eye.
with open("html_files/卷001之1.html", "r", encoding="utf-8") as page:
    soup = BeautifulSoup(page.read(), "html.parser")
poem_div = soup.find("div", class_="poem")
if poem_div:
    raw_lines = poem_div.get_text().split("\n")
    lines = [entry.strip() for entry in raw_lines if entry.strip()]
    for i, line in enumerate(lines[:50]):
        print(f"{i}: {line}")

View File

@@ -1,5 +0,0 @@
import re

# Sanity-check the tokenizer: every headword must be paired with the full
# run of 〈...〉 annotation blocks that immediately follows it.
text = "對語〈渭北 江東〉〈 平北 安東〉摘句〈力障百川東〉"
pattern = re.compile(r"([^〈〉]*)((?:〈[^〉]+〉)+)")
tokens = pattern.findall(text)
for i, (word, desc_blocks) in enumerate(tokens):
    print(f"Token {i}: WORD='{word}' DESCS={desc_blocks}")

View File

@@ -1,54 +0,0 @@
import json
import re
# Load the scraped dictionary. NOTE(review): `data` is read here for
# interactive exploration only; the test cases below do not reference it.
with open('peiwenyunfu.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
# Headword prefixes to strip before mapping 丨 placeholders back to characters.
# NOTE(review): the second entry renders as "" — likely a garbled character;
# as written it is a harmless no-op (strips nothing). Confirm the original.
prefixes = ["韻藻", ""]


def replace_pipes(content, word):
    """Replace each 丨 placeholder in *content* with the matching character of *word*.

    The classical source abbreviates the headword inside a quotation by
    writing 丨 for each of its characters. Pipes are mapped to the headword's
    characters in order; a run of five or more non-pipe characters is taken
    to start a fresh occurrence of the headword, so the mapping index resets.

    Bug fix: the placeholder literal had been garbled to "" in the source
    (`char == ""` is never true for a length-1 char), which made the whole
    function a no-op. The intact `丨` characters in the test data establish
    the intended literal.
    """
    clean_word = word
    for p in prefixes:
        if clean_word.startswith(p) and len(clean_word) > len(p):
            clean_word = clean_word[len(p):]
            break  # only strip one prefix
    word_len = len(clean_word)
    if word_len == 0:
        # Nothing to substitute with: drop the placeholders entirely.
        return content.replace("丨", "")
    result = []
    pipe_idx = 0
    chars_since_last_pipe = 0
    for char in content:
        if char == "丨":
            if chars_since_last_pipe >= 5:
                # A long gap means a new occurrence of the headword begins,
                # so restart the character mapping from its first character.
                pipe_idx = 0
            result.append(clean_word[pipe_idx % word_len])
            pipe_idx += 1
            chars_since_last_pipe = 0
        else:
            result.append(char)
            chars_since_last_pipe += 1
    return "".join(result)
# Run the substitution over a handful of real entries and eyeball the output.
test_cases = [
    ("首陽東", "詩采葑采葑丨丨之丨"),
    ("馬首東", "左傳欒黶曰吾丨丨欲丨乃歸下軍從之"),
    ("澗瀍東", "書洛誥我乃卜丨水東丨水西惟洛食我又卜瀍水丨亦惟洛食"),
    ("日夜東", "丨丨虞集詩絳桃風急丨丨丨王惲詩付與衡漳丨丨丨許有壬詩江水舟"),
    ("東海東", "樓鑰詩萬里逺在丨丨丨張經詩崑崙之西丨"),
]
for headword, quote in test_cases:
    print(f"Word: {headword}")
    print(f"Orig: {quote}")
    print(f"Fix : {replace_pipes(quote, headword)}")
    print("-" * 40)

View File

@@ -1,37 +0,0 @@
import json
# NOTE(review): the second prefix renders as "" — likely a garbled character;
# stripping an empty prefix is a no-op, so it is harmless as written.
prefixes = ["韻藻", ""]


def replace_pipes_no_reset(content, word):
    """Map every 丨 in *content* to *word*'s characters cyclically.

    Unlike the reset variant, the rolling index never restarts: the n-th
    pipe in the whole entry always gets character n mod len(word).

    Bug fix: the placeholder literal had been garbled to "" in the source
    (`char == ""` is never true for a length-1 char), making the function a
    no-op; restored to 丨 as established by the intact test data.
    """
    clean_word = word
    for p in prefixes:
        if clean_word.startswith(p) and len(clean_word) > len(p):
            clean_word = clean_word[len(p):]
            break
    word_len = len(clean_word)
    if word_len == 0:
        # Nothing to substitute with: drop the placeholders entirely.
        return content.replace("丨", "")
    result = []
    pipe_idx = 0
    for char in content:
        if char == "丨":
            result.append(clean_word[pipe_idx % word_len])
            pipe_idx += 1
        else:
            result.append(char)
    return "".join(result)
# Print the no-reset variant's output for the same sample entries.
test_cases = [
    ("首陽東", "詩采葑采葑丨丨之丨"),
    ("馬首東", "左傳欒黶曰吾丨丨欲丨乃歸下軍從之"),
    ("澗瀍東", "書洛誥我乃卜丨水東丨水西惟洛食我又卜瀍水丨亦惟洛食"),
    ("日夜東", "丨丨虞集詩絳桃風急丨丨丨王惲詩付與衡漳丨丨丨許有壬詩江水舟"),
    ("東海東", "樓鑰詩萬里逺在丨丨丨張經詩崑崙之西丨"),
]
for headword, quote in test_cases:
    print(f"Word: {headword}")
    print(f"NoRst: {replace_pipes_no_reset(quote, headword)}")
    print("-" * 40)

View File

@@ -1,56 +0,0 @@
import json
import re
# NOTE(review): the second prefix renders as "" — likely a garbled character;
# stripping an empty prefix is a no-op, so it is harmless as written.
prefixes = ["韻藻", ""]


def replace_pipes_hybrid(content, word):
    """Replace runs of 丨 in *content* with characters of *word*.

    Hybrid strategy: a run whose length is an exact multiple of the headword
    length is treated as one or more complete occurrences of the headword
    (and resets the rolling index); any other run continues the rolling
    character sequence from where the previous run stopped.

    Bug fix: the empty-word fallback's literal had been garbled to "" in the
    source, making that replace a no-op; restored to 丨, matching the intact
    r'丨+' pattern used below.
    """
    clean_word = word
    for p in prefixes:
        if clean_word.startswith(p) and len(clean_word) > len(p):
            clean_word = clean_word[len(p):]
            break
    word_len = len(clean_word)
    if word_len == 0:
        # Nothing to substitute with: drop the placeholders entirely.
        return content.replace("丨", "")

    def repl(match):
        nonlocal pipe_idx
        block = match.group(0)
        block_len = len(block)
        if block_len % word_len == 0:
            # Full word match! Reset alignment.
            pipe_idx = 0
            return clean_word * (block_len // word_len)
        else:
            # Partial run: continue from the current rolling position.
            res = ""
            for _ in range(block_len):
                res += clean_word[pipe_idx % word_len]
                pipe_idx += 1
            return res

    pipe_idx = 0
    return re.sub(r'丨+', repl, content)
# Compare the hybrid strategy against the sample entries, then against one
# extra case where a full-length run should reset the alignment.
test_cases = [
    ("首陽東", "詩采葑采葑丨丨之丨"),
    ("馬首東", "左傳欒黶曰吾丨丨欲丨乃歸下軍從之"),
    ("澗瀍東", "書洛誥我乃卜丨水東丨水西惟洛食我又卜瀍水丨亦惟洛食"),
    ("日夜東", "丨丨虞集詩絳桃風急丨丨丨王惲詩付與衡漳丨丨丨許有壬詩江水舟"),
    ("東海東", "樓鑰詩萬里逺在丨丨丨張經詩崑崙之西丨"),
]
for headword, quote in test_cases:
    print(f"Word: {headword}")
    print(f"Orig: {quote}")
    print(f"Hybr: {replace_pipes_hybrid(quote, headword)}")
    print("-" * 40)
test_cases.append(("紫殿東", "時魚躍丨丨丨温庭筠詩一夕丨丨"))
for headword, quote in test_cases[-1:]:
    print(f"Word: {headword}")
    print(f"Orig: {quote}")
    print(f"Hybr: {replace_pipes_hybrid(quote, headword)}")
    print("-" * 40)

View File

@@ -1,31 +0,0 @@
import json
from parser import parse_html
def test_parse_html():
    """Smoke-test parse_html on one downloaded volume and dump the result.

    NOTE(review): requires html_files/卷001之1.html to exist locally.
    """
    file_path = "html_files/卷001之1.html"
    result = parse_html(file_path)
    # Save for manual inspection
    with open("output.json", "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    # Check that it returns a dictionary
    assert isinstance(result, dict)
    # Let's see what keys are in it
    keys = list(result.keys())
    print("Keys found:", keys)
    if len(keys) > 0:
        first_key = keys[0]
        # NOTE(review): the three "" membership checks below test for an
        # empty-string key — almost certainly garbled Chinese field names
        # (compare the intact 小韵描述/词条 keys below); confirm against the
        # original file.
        assert "" in result[first_key]
        assert "" in result[first_key]
        assert "" in result[first_key]
        assert "小韵描述" in result[first_key]
        assert "词条" in result[first_key]
        assert "对语" in result[first_key]
        assert "摘句" in result[first_key]
if __name__ == "__main__":
    test_parse_html()
    print("Tests passed!")

View File

@@ -1,49 +0,0 @@
import json
import re
# Pilot study: for the first few rhymes, try substituting placeholder marks
# in 词条 entries with the headword characters and report how often the pipe
# count divides evenly into the headword length.
with open('peiwenyunfu.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
# Analyze a few
count_match = 0
count_mismatch = 0
for rhyme, r_data in list(data.items())[:5]: # Skip metadata, preface
    if rhyme in ["metadata", "preface"]:
        continue
    print(f"\nRhyme: {rhyme}")
    # 1. 小韵描述
    desc = r_data.get("小韵描述", "")
    # NOTE(review): replace("", rhyme) inserts the rhyme between every
    # character — the needle was almost certainly the placeholder 丨,
    # garbled in transfer; confirm against the original file.
    desc_fixed = desc.replace("", rhyme)
    print(f"Desc original: {desc[:30]}...")
    print(f"Desc fixed: {desc_fixed[:30]}...")
    # 2. 词条
    for word, content in list(r_data.get("词条", {}).items())[:5]:
        # NOTE(review): count("") returns len(content) + 1 — same garbled
        # placeholder literal as above; as written pipe_count is never 0.
        pipe_count = content.count("")
        word_len = len(word)
        if pipe_count == 0:
            continue
        print(f"Word: {word} (len {word_len}), pipes: {pipe_count}")
        print(f"Original: {content}")
        # Test replacing
        if pipe_count % word_len == 0:
            # We can replace them in groups of word_len
            fixed_content = ""
            pipe_idx = 0
            for char in content:
                # NOTE(review): garbled placeholder literal again — a
                # length-1 char never equals "", so this branch never fires.
                if char == "":
                    fixed_content += word[pipe_idx % word_len]
                    pipe_idx += 1
                else:
                    fixed_content += char
            print(f"Fixed: {fixed_content}")
            count_match += 1
        else:
            print("MISMATCH length!")
            count_mismatch += 1
print(f"\nMatches: {count_match}, Mismatches: {count_mismatch}")

View File

@@ -1,46 +0,0 @@
import asyncio
from playwright.async_api import async_playwright
import urllib.parse
import sys
async def main():
    """Probe one URL with headless Chromium: first through the local SOCKS5
    proxy, then — unconditionally — with a direct connection, reporting the
    fetched content length for each attempt."""
    url = sys.argv[1]
    proxy_server = "socks5://127.0.0.1:10808"
    ua = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
          "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
    async with async_playwright() as pw:
        # Attempt 1: route traffic through the local proxy.
        chromium = await pw.chromium.launch(headless=True, proxy={"server": proxy_server})
        ctx = await chromium.new_context(user_agent=ua)
        tab = await ctx.new_page()
        try:
            print(f"Loading {url} via proxy...")
            await tab.goto(url, timeout=30000, wait_until="domcontentloaded")
            body = await tab.content()
            print(f"Success! Content length: {len(body)}")
        except Exception as e:
            print(f"Error via proxy: {e}")
        await chromium.close()
        # Attempt 2: always retried directly, even when the proxy worked.
        print("Retrying without proxy...")
        chromium = await pw.chromium.launch(headless=True)
        ctx = await chromium.new_context(user_agent=ua)
        tab = await ctx.new_page()
        try:
            await tab.goto(url, timeout=30000, wait_until="domcontentloaded")
            body = await tab.content()
            print(f"Success! Content length: {len(body)}")
        except Exception as e2:
            print(f"Error without proxy: {e2}")
        await chromium.close()
if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -1,21 +0,0 @@
import json
# Count 词条 entries whose placeholder count is not an exact multiple of the
# headword length (such entries cannot be mapped 1:1 back to the headword).
with open('peiwenyunfu.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
mismatches = []
prefixes_found = set()  # NOTE(review): never written to — dead variable
for rhyme, r_data in data.items():
    if rhyme in ["metadata", "preface"]:
        continue
    for word, content in r_data.get("词条", {}).items():
        # NOTE(review): str.count("") returns len(content) + 1, so the
        # == 0 check below can never trigger — the literal was probably the
        # placeholder 丨, garbled in transfer; confirm the original file.
        pipe_count = content.count("")
        if pipe_count == 0:
            continue
        if pipe_count % len(word) != 0:
            mismatches.append((word, pipe_count))
print(f"Total mismatches: {len(mismatches)}")
for w, p in mismatches[:20]:
    print(f"{w} (len {len(w)}), pipes: {p}")

View File

@@ -1,13 +0,0 @@
import json

# Dump the raw entries for a few headwords whose placeholder substitution
# needed manual inspection.
TARGETS = {"紫殿東", "少微東", "東海東", "日夜東", "隔西東"}

with open('peiwenyunfu.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
for rhyme, r_data in data.items():
    if rhyme in ("metadata", "preface"):
        continue
    for word, content in r_data.get("词条", {}).items():
        if word not in TARGETS:
            continue
        print(f"Word: {word}")
        print(f"Content: {content}")
        print("-" * 40)

View File

@@ -1,18 +0,0 @@
import re
from bs4 import BeautifulSoup
# Extract the volume's tone heading, rhyme name, and the line listing the
# rhyme's member characters from one scraped wikisource page.
with open("html_files/卷001之1.html", "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f.read(), "html.parser")
poem_div = soup.find("div", class_="poem")
text = poem_div.get_text()
# Extract the list of characters.
# It appears after "一東韻一" or similar.
m = re.search(r"(上平聲|下平聲|上聲|去聲|入聲)\s+(.*?韻.*?)\n(.*?)\n", text)
if m:
    print("Tone:", m.group(1))
    print("Rhyme:", m.group(2))
    print("Chars line:", m.group(3))
    # NOTE(review): both replace() arguments render as a space here —
    # presumably full-width space -> ASCII space; confirm the actual bytes.
    rhyme_chars = [c for c in m.group(3).replace(' ', ' ').split() if len(c) == 1]
    print("Chars:", rhyme_chars)

View File

@@ -1,32 +0,0 @@
import json
import re
# Measure how many 词条 entries still have a placeholder count that does not
# divide the headword length once known headword prefixes are stripped.
with open('peiwenyunfu.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
# NOTE(review): the second prefix renders as "" — probably a garbled
# character; stripping an empty prefix is a no-op, so confirm the original.
prefixes = ["韻藻", ""]
mismatches = []
total_pipes = 0
for rhyme, r_data in data.items():
    if rhyme in ["metadata", "preface"]:
        continue
    for word, content in r_data.get("词条", {}).items():
        clean_word = word
        # Unlike the replace_pipes variants, this loop has no break, so
        # every matching prefix is stripped in turn.
        for p in prefixes:
            if clean_word.startswith(p) and len(clean_word) > len(p):
                clean_word = clean_word[len(p):]
        # NOTE(review): count("") returns len(content) + 1, so the == 0
        # check below never triggers — the literal was probably the
        # placeholder 丨, garbled in transfer; confirm the original file.
        pipe_count = content.count("")
        if pipe_count == 0:
            continue
        total_pipes += 1
        if pipe_count % len(clean_word) != 0:
            mismatches.append((word, clean_word, pipe_count, content))
print(f"Total entries with pipes: {total_pipes}")
print(f"Total mismatches after stripping: {len(mismatches)}")
for w, cw, p, c in mismatches[:10]:
    print(f"{w} -> {cw} (len {len(cw)}), pipes: {p}")
    print(f"  {c}")

View File

@@ -1,21 +0,0 @@
import re
from bs4 import BeautifulSoup
# Tokenization experiment on real page text: flatten the poem container and
# split it into (headword, annotation-blocks) pairs.
with open("html_files/卷001之1.html", "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f.read(), "html.parser")
text = soup.find("div", class_="poem").get_text().replace('\n', '')
# Remove header junk for this test (just find the first '東〈')
start_idx = text.find('東〈')
text = text[start_idx:]
# Tokenize into pairs of (Word, Description)
# Using regex to find all Word〈Description〉
# Wait, multiple 〈 〉 can follow a word like 對語〈...〉〈...〉
# We can find all chunks of non-〈 characters, followed by one or more 〈...〉
tokens = re.findall(r"([^〈〉]*)((?:〈[^〉]+〉)+)", text)
for i, (word, desc_blocks) in enumerate(tokens[:20]):
    print(f"Token {i}: WORD='{word}' DESCS={desc_blocks[:30]}...")