Update: 删除垃圾程序
This commit is contained in:
@@ -1,8 +0,0 @@
|
||||
import json
|
||||
|
||||
# Peek at the first 20 headword entries stored under the rhyme character "𩅰".
with open('peiwenyunfu.json', 'r', encoding='utf-8') as fh:
    corpus = json.load(fh)

entries = corpus.get("𩅰", {}).get("词条", {})
preview = list(entries.items())[:20]
for headword, gloss in preview:
    # Only the first 30 characters of each gloss are shown.
    print(f"{headword}: {gloss[:30]}...")
|
||||
@@ -1,7 +0,0 @@
|
||||
import json
|
||||
# Pretty-print the v2 records of a few hand-picked rhyme characters.
with open('peiwenyunfu_v2.json', 'r', encoding='utf-8') as fh:
    corpus = json.load(fh)

for rhyme_char in ('𩅰', '桵', '离', '梩'):
    record = corpus.get(rhyme_char, {})
    print(f"Rhyme: {rhyme_char}")
    print(json.dumps(record, ensure_ascii=False, indent=2))
|
||||
@@ -1,71 +0,0 @@
|
||||
import os
|
||||
import time
|
||||
import urllib.request
|
||||
import urllib.parse
|
||||
from urllib.error import HTTPError
|
||||
|
||||
def main():
    """Download every URL listed in ``still_missing.txt`` into ``html_files/``.

    Each non-empty line of ``still_missing.txt`` is treated as one URL.  The
    last path segment of the percent-decoded URL names the output file.
    Pages already present on disk are skipped; every other page gets up to
    five attempts through the local HTTP proxy on 127.0.0.1:10808.
    """
    os.makedirs("html_files", exist_ok=True)

    with open("still_missing.txt", "r", encoding="utf-8") as f:
        urls = [line.strip() for line in f if line.strip()]

    # Installed globally so that urlopen() below is routed through the proxy.
    proxy_support = urllib.request.ProxyHandler({'http': 'http://127.0.0.1:10808', 'https': 'http://127.0.0.1:10808'})
    opener = urllib.request.build_opener(proxy_support)
    urllib.request.install_opener(opener)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Referer': 'https://zh.wikisource.org/'
    }

    total = len(urls)

    for position, url in enumerate(urls, start=1):
        vol = urllib.parse.unquote(url).split('/')[-1]
        filename = f'html_files/{vol}.html'

        if os.path.exists(filename):
            print(f"[{position}/{total}] Skipping {vol} (exists)")
            continue

        print(f"[{position}/{total}] Downloading {vol}...")

        success = False
        for _attempt in range(5):
            try:
                req = urllib.request.Request(url, headers=headers)
                # The context manager closes the HTTP response even when
                # read() or decode() raises; it was previously left open.
                with urllib.request.urlopen(req, timeout=15) as response:
                    html = response.read().decode('utf-8')

                with open(filename, 'w', encoding='utf-8') as out_f:
                    out_f.write(html)

                print(" -> Success!")
                success = True
                break

            except HTTPError as e:
                if e.code == 403:
                    # 403 is treated as rate limiting: back off longer.
                    print(" -> 403 Forbidden. Waiting 5 seconds...")
                    time.sleep(5)
                else:
                    print(f" -> HTTP Error {e.code}")
            except Exception as e:
                # Best-effort batch job: report and retry rather than abort.
                print(f" -> Error: {e}")

            time.sleep(2)

        if not success:
            print(" -> Failed all attempts.")

        time.sleep(1)  # Be nice to server


if __name__ == "__main__":
    main()
|
||||
@@ -1,110 +0,0 @@
|
||||
import urllib.request
|
||||
from bs4 import BeautifulSoup
|
||||
import urllib.parse
|
||||
import re
|
||||
import os
|
||||
|
||||
url = "https://api.allorigins.win/raw?url=https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)"
|
||||
|
||||
def chinese_to_arabic(cn_str):
    """Convert a volume number to an int.

    Accepts three spellings:

    * decimal digits, e.g. ``"070"`` -> 70 (the form used in the page URLs);
    * positional Chinese digits, e.g. ``"一〇六"`` -> 106;
    * Chinese numerals with units, e.g. ``"十"`` -> 10, ``"二十三"`` -> 23,
      ``"一百零六"`` -> 106.

    Characters that are neither digits nor units count as 0.  An empty
    string yields 0.
    """
    # Decimal digits would otherwise each fall through to the "unknown
    # character" path and the whole number would come out as 0.
    if cn_str.isdecimal():
        return int(cn_str)

    digits = {
        "零": 0, "〇": 0, "一": 1, "二": 2, "三": 3, "四": 4,
        "五": 5, "六": 6, "七": 7, "八": 8, "九": 9,
    }
    units = {"十": 10, "百": 100, "千": 1000}

    # Several digits with no unit character are read positionally.
    if len(cn_str) > 1 and all(char in digits for char in cn_str):
        return int("".join(str(digits[char]) for char in cn_str))

    result = 0
    pending = 0  # digit waiting for the unit that follows it
    for char in cn_str:
        unit = units.get(char)
        if unit is None:
            pending = digits.get(char, 0)
        else:
            # A unit with no preceding digit has an implicit 1 ("十" == 10).
            result += (pending or 1) * unit
            pending = 0
    return result + pending
|
||||
|
||||
def get_filename(vol_str):
    """Return the local HTML file name for a volume title.

    ``卷<vol>之<part>`` becomes ``卷NNN之P.html`` and ``卷<vol>`` becomes
    ``卷NNN.html``, with the volume zero-padded to three digits.  Any other
    title is returned with ``.html`` appended.
    """
    sectioned = re.match(r"卷(.+?)之(.+)", vol_str)
    if sectioned is not None:
        volume, section = (chinese_to_arabic(part) for part in sectioned.groups())
        return f"卷{volume:03d}之{section}.html"

    plain = re.match(r"卷(.+)", vol_str)
    if plain is None:
        return vol_str + ".html"
    return f"卷{chinese_to_arabic(plain.group(1)):03d}.html"
|
||||
|
||||
def main():
    """Write the index-page volume URLs that have no local file to ``missing_urls.txt``.

    Fetches the index page named by the module-level ``url``, collects every
    link whose decoded target looks like ``.../卷X之Y`` (skipping 全覽
    aggregate pages), maps each one to a local file name with
    ``get_filename`` and keeps the URLs whose file is absent from
    ``html_dir``.  If the index page cannot be fetched the function reports
    the error and returns without writing anything.
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
        with urllib.request.urlopen(req, timeout=30) as response:
            html = response.read().decode("utf-8")
    except Exception as e:
        # Best-effort: report and stop here instead of relying on a later
        # NameError for the unbound ``html``, which would also hide a
        # genuinely missing name such as BeautifulSoup.
        print("Failed to fetch:", e)
        return

    soup = BeautifulSoup(html, "html.parser")

    html_dir = "/mnt/fast/private/denglifan/workspace/spider-ctext/佩文韵府/html_files/"
    existing_files = set(os.listdir(html_dir)) if os.path.exists(html_dir) else set()

    missing_urls = []
    seen_urls = set()
    base_url = "https://zh.wikisource.org"

    for a in soup.find_all("a"):
        href = a.get("href")
        if not href:
            continue
        unquoted_href = urllib.parse.unquote(href)

        if not ("御定佩文韻府" in unquoted_href and "四庫全書本" in unquoted_href and "/卷" in unquoted_href):
            continue

        # Drop any "#fragment" so it cannot leak into the file name.
        title_part = unquoted_href.split("/")[-1].split("#")[0]

        # Aggregate ("全覽") pages duplicate the per-section pages.
        if "全覽" in title_part or "全览" in title_part:
            continue

        # Only sectioned volumes ("卷X之Y") are wanted.
        if not re.match(r"卷.+?之.+", title_part):
            continue

        full_url = urllib.parse.urljoin(base_url, href).split('#')[0]
        if full_url in seen_urls:
            continue
        seen_urls.add(full_url)

        if get_filename(title_part) not in existing_files:
            missing_urls.append(full_url)

    with open("missing_urls.txt", "w", encoding="utf-8") as f:
        f.writelines(u + "\n" for u in missing_urls)

    print(f"Found {len(missing_urls)} missing URLs.")


if __name__ == "__main__":
    main()
|
||||
@@ -1,14 +0,0 @@
|
||||
import json
|
||||
|
||||
# Replace the "丨" placeholder inside every entry text with the rhyme
# character the entry is filed under, then rewrite the JSON file in place.
with open('peiwenyunfu.json', 'r', encoding='utf-8') as src:
    data = json.load(src)

for rhyme, r_data in data.items():
    if rhyme in ["metadata", "preface"] or "词条" not in r_data:
        continue
    r_data["词条"] = {
        word: content.replace("丨", rhyme) if "丨" in content else content
        for word, content in r_data["词条"].items()
    }

with open('peiwenyunfu.json', 'w', encoding='utf-8') as dst:
    json.dump(data, dst, ensure_ascii=False, indent=4)
|
||||
@@ -1,69 +0,0 @@
|
||||
import json
|
||||
import re
|
||||
|
||||
print("Loading peiwenyunfu.json...")
|
||||
with open('peiwenyunfu.json', 'r', encoding='utf-8') as f:
|
||||
data = json.load(f)
|
||||
|
||||
prefixes = ["韻藻", "增", "増"]
|
||||
|
||||
def clean_headword(word):
    """Return ``word`` with leading marker prefixes removed.

    Every entry of the module-level ``prefixes`` list is tried in order, and
    the whole list is walked twice so a stacked prefix is also removed.  A
    prefix is only stripped when something is left after it.
    """
    stripped = word
    passes = 0
    while passes < 2:
        for marker in prefixes:
            if len(stripped) > len(marker) and stripped.startswith(marker):
                stripped = stripped[len(marker):]
        passes += 1
    return stripped
|
||||
|
||||
def replace_pipes_in_content(content, word):
    """Expand runs of the "丨" placeholder in ``content`` using the headword.

    The headword is ``word`` after ``clean_headword``.  A run whose length
    is a multiple of the headword length becomes that many whole copies and
    resets the running position.  Any other run is filled character by
    character, continuing from where the previous partial run stopped.
    ``content`` is returned unchanged when the cleaned headword is empty or
    no placeholder is present.
    """
    headword = clean_headword(word)
    size = len(headword)
    if not size or "丨" not in content:
        return content

    cursor = 0  # position within the headword carried across partial runs

    def expand(match):
        nonlocal cursor
        run = len(match.group(0))
        if run % size:
            start = cursor
            cursor += run
            return "".join(headword[(start + offset) % size] for offset in range(run))
        # Whole-word run: realign to the start of the headword.
        cursor = 0
        return headword * (run // size)

    return re.sub(r'丨+', expand, content)
|
||||
|
||||
print("Processing...")
for rhyme, r_data in data.items():
    if rhyme in ["metadata", "preface"]:
        continue

    # 1. In the rhyme description the placeholder stands for the rhyme
    #    character itself, i.e. the dictionary key.
    if "小韵描述" in r_data and r_data["小韵描述"]:
        description = r_data["小韵描述"]
        r_data["小韵描述"] = description.replace("丨", rhyme)

    # 2. In the entries the placeholder stands for the headword.
    if "词条" in r_data:
        r_data["词条"] = {
            word: replace_pipes_in_content(content, word)
            for word, content in r_data["词条"].items()
        }

print("Saving peiwenyunfu.json...")
with open('peiwenyunfu.json', 'w', encoding='utf-8') as out:
    json.dump(data, out, ensure_ascii=False, indent=4)
print("Done!")
|
||||
@@ -1,24 +0,0 @@
|
||||
import json
|
||||
|
||||
# Substitute the rhyme character for the "丨" placeholder in the 对语 and
# 摘句 texts and in the entry keys, then rewrite the JSON file in place.
print("Loading...")
with open('peiwenyunfu.json', 'r', encoding='utf-8') as src:
    data = json.load(src)

for rhyme, r_data in data.items():
    if rhyme in ["metadata", "preface"]:
        continue

    for field in ("对语", "摘句"):
        if field in r_data and r_data[field]:
            r_data[field] = r_data[field].replace("丨", rhyme)

    if "词条" in r_data:
        # Keys that become equal after substitution keep the later content.
        r_data["词条"] = {
            word.replace("丨", rhyme): content
            for word, content in r_data["词条"].items()
        }

with open('peiwenyunfu.json', 'w', encoding='utf-8') as dst:
    json.dump(data, dst, ensure_ascii=False, indent=4)
print("Done!")
|
||||
@@ -1,57 +0,0 @@
|
||||
import json
|
||||
|
||||
print("Loading...")
|
||||
with open('peiwenyunfu.json', 'r', encoding='utf-8') as f:
|
||||
data = json.load(f)
|
||||
|
||||
new_data = {}
|
||||
new_data['metadata'] = data['metadata']
|
||||
new_data['preface'] = data['preface']
|
||||
|
||||
prefixes = ["韻藻", "韵藻", "増", "增"]
|
||||
|
||||
def clean_key(k):
    """Strip leading marker prefixes from ``k`` until none applies.

    Prefixes come from the module-level ``prefixes`` list.  A prefix is only
    removed when at least one character remains after it.
    """
    while True:
        before = k
        for prefix in prefixes:
            if len(k) > len(prefix) and k.startswith(prefix):
                k = k[len(prefix):]
        # A full pass that removed nothing means the key is clean.
        if k == before:
            return k
|
||||
|
||||
# Rebuild the data set so that single-character keys whose text opens like a
# dictionary definition become rhyme-character records of their own; all
# other entries are filed (with cleaned keys) under the most recent record.
for rhyme, r_data in data.items():
    if rhyme in ['metadata', 'preface']:
        continue

    # The record for the original main rhyme character.
    new_data[rhyme] = {
        "卷": r_data["卷"],
        "声": r_data["声"],
        "韵": r_data["韵"],
        "小韵描述": r_data["小韵描述"],
        "韵藻": {},
        "对语": r_data.get("对语", ""),
        "摘句": r_data.get("摘句", "")
    }
    current_rhyme = rhyme

    for k, v in r_data.get("词条", {}).items():
        starts_new_rhyme = len(k) == 1 and any(
            marker in v[:15]
            for marker in ['切', '説文', '廣韻', '玉篇', '集韻', '韻㑹', '音', '同', '釋名', '爾雅']
        )
        if not starts_new_rhyme:
            new_data[current_rhyme]["韵藻"][clean_key(k)] = v
            continue

        current_rhyme = k
        new_data[current_rhyme] = {
            "卷": r_data["卷"],
            "声": r_data["声"],
            "韵": r_data["韵"],
            "小韵描述": k + v,
            "韵藻": {},
            "对语": "",
            "摘句": ""
        }

with open('peiwenyunfu_v2.json', 'w', encoding='utf-8') as out:
    json.dump(new_data, out, ensure_ascii=False, indent=4)
print(f"Old size: {len(data)}, New size: {len(new_data)}")
|
||||
@@ -1,28 +0,0 @@
|
||||
import json
|
||||
|
||||
# Fill every empty "卷" field with the most recent non-empty volume seen
# earlier in the file, then rewrite the JSON file in place.
print("Loading peiwenyunfu.json...")
with open('peiwenyunfu.json', 'r', encoding='utf-8') as src:
    data = json.load(src)

last_volume = ""
fixed_count = 0

for rhyme, r_data in data.items():
    if rhyme in ['metadata', 'preface']:
        continue

    volume = r_data.get("卷", "").strip()
    if volume:
        # Remember the latest valid volume.
        last_volume = volume
    elif last_volume:
        # Empty volume: inherit the latest valid one.
        r_data["卷"] = last_volume
        fixed_count += 1

print(f"Fixed {fixed_count} missing volumes.")

with open('peiwenyunfu.json', 'w', encoding='utf-8') as dst:
    json.dump(data, dst, ensure_ascii=False, indent=4)
print("Saved to peiwenyunfu.json!")
|
||||
@@ -1,83 +0,0 @@
|
||||
import os
|
||||
import glob
|
||||
import json
|
||||
import re
|
||||
from parser import parse_html
|
||||
|
||||
def natural_sort_key(s):
    """Return a sort key that orders volume files numerically.

    File names containing ``卷<digits>`` optionally followed by
    ``之<digits>`` map to ``(1, volume, section)``, with section 0 when it
    is absent.  Anything else maps to ``(2, 0, 0)`` and sorts last.
    """
    found = re.search(r"卷(\d+)(?:之(\d+))?", os.path.basename(s))
    if found is None:
        return (2, 0, 0)
    volume_text, section_text = found.groups()
    return (1, int(volume_text), int(section_text) if section_text else 0)
|
||||
|
||||
def main():
    """Parse every ``html_files/卷*.html`` page and write ``peiwenyunfu.json``.

    Results from all pages are merged per rhyme character and stored under
    the ``content`` key next to the ``metadata`` and ``preface`` keys.
    """
    # Restricting the glob to names starting with "卷" leaves out the
    # "全覽" aggregate pages, which would duplicate entries.
    files = sorted(glob.glob('html_files/卷*.html'), key=natural_sort_key)

    print(f"Starting to parse {len(files)} files...")

    combined_result = {}
    success_count = 0
    fail_count = 0

    for idx, fpath in enumerate(files):
        try:
            for raw_key, parsed in parse_html(fpath).items():
                # Drop the "(韵母)" marker that prefixes every parsed key.
                rhyme = raw_key.replace("(韵母)", "")

                if rhyme not in combined_result:
                    combined_result[rhyme] = parsed
                    continue

                # The rhyme was already seen in an earlier file: merge.
                existing = combined_result[rhyme]
                existing["词条"].update(parsed.get("词条", {}))
                for field in ("对语", "摘句"):
                    if field in parsed and parsed[field]:
                        existing[field] += parsed[field]

                # Back-fill header fields the earlier file failed to provide.
                for field in ("卷", "声", "韵"):
                    if not existing[field] and parsed[field]:
                        existing[field] = parsed[field]

            success_count += 1
            if idx % 50 == 0:
                print(f"Parsed {idx}/{len(files)} files...")
        except Exception as e:
            print(f"Failed to parse {fpath}: {e}")
            fail_count += 1

    print(f"Parsing finished. Success: {success_count}, Failed: {fail_count}")
    print(f"Total unique rhyme characters extracted: {len(combined_result)}")

    # Final document: metadata header plus the merged content.
    final_output = {
        "metadata": {
            "title": "御定佩文韵府",
            "author": "张玉书等",
            "dynasty": "清",
            "total_volumes": 106,
            "source": "2026年3月22日从维基文库导出"
        },
        "preface": "",
        "content": combined_result
    }

    with open('peiwenyunfu.json', 'w', encoding='utf-8') as f:
        json.dump(final_output, f, ensure_ascii=False, indent=4)
    print("Saved output to peiwenyunfu.json")


if __name__ == "__main__":
    main()
|
||||
@@ -1,79 +0,0 @@
|
||||
import os
|
||||
import glob
|
||||
import json
|
||||
import re
|
||||
from parser import parse_html
|
||||
|
||||
def natural_sort_key(s):
    """Build a numeric sort key from a ``卷<vol>[之<part>]`` file name.

    Returns ``(1, vol, part)`` for matching names (``part`` is 0 when the
    name has no ``之`` section) and ``(2, 0, 0)`` for every other name.
    """
    name = os.path.basename(s)
    hit = re.search(r"卷(\d+)(?:之(\d+))?", name)
    if not hit:
        return (2, 0, 0)
    part = hit.group(2)
    return (1, int(hit.group(1)), int(part) if part else 0)
|
||||
|
||||
def main():
    """Parse every ``html_files/卷*.html`` page and write ``peiwenyunfu.json``.

    Results from all pages are merged per rhyme character; the merged
    records become top-level keys next to ``metadata`` and ``preface``.
    """
    files = sorted(glob.glob('html_files/卷*.html'), key=natural_sort_key)

    print(f"Starting to parse {len(files)} files...")

    combined_result = {}
    success_count = 0
    fail_count = 0

    for idx, fpath in enumerate(files):
        try:
            for raw_key, parsed in parse_html(fpath).items():
                # Drop the "(韵母)" marker that prefixes every parsed key.
                rhyme = raw_key.replace("(韵母)", "")

                if rhyme not in combined_result:
                    combined_result[rhyme] = parsed
                    continue

                # Merge into the record created by an earlier file.
                existing = combined_result[rhyme]
                existing["词条"].update(parsed.get("词条", {}))
                for text_field in ("对语", "摘句"):
                    if text_field in parsed and parsed[text_field]:
                        existing[text_field] += parsed[text_field]

                for header_field in ("卷", "声", "韵"):
                    if not existing[header_field] and parsed[header_field]:
                        existing[header_field] = parsed[header_field]

            success_count += 1
            if idx % 50 == 0:
                print(f"Parsed {idx}/{len(files)} files...")
        except Exception as e:
            print(f"Failed to parse {fpath}: {e}")
            fail_count += 1

    print(f"Parsing finished. Success: {success_count}, Failed: {fail_count}")
    print(f"Total unique rhyme characters extracted: {len(combined_result)}")

    final_output = {
        "metadata": {
            "title": "御定佩文韵府",
            "author": "张玉书等奉敕编",
            "dynasty": "清",
            "total_volumes": 106,
            "source": "2026年3月22日从维基文库导出"
        },
        "preface": ""
    }
    # Rhyme records sit at the top level, after the two header keys.
    final_output.update(combined_result)

    with open('peiwenyunfu.json', 'w', encoding='utf-8') as f:
        json.dump(final_output, f, ensure_ascii=False, indent=4)
    print("Saved output to peiwenyunfu.json")


if __name__ == "__main__":
    main()
|
||||
@@ -1,9 +0,0 @@
|
||||
def simplify_char(c):
    """Return the simplified form of ``c``, or ``c`` itself when unmapped."""
    table = {'東': '东', '銅': '铜', '童': '童'}  # add others if needed
    try:
        return table[c]
    except KeyError:
        return c
|
||||
|
||||
num_map = {'一':'1', '二':'2', '三':'3', '四':'4', '五':'5', '六':'6', '七':'7', '八':'8', '九':'9'}
|
||||
|
||||
# "01之一"
|
||||
# If vol_m = re.search(r"卷(.)之(.)", text)
|
||||
# "0" + num_map[vol_m.group(1)] + "之" + vol_m.group(2)
|
||||
124
佩文韵府/parser.py
124
佩文韵府/parser.py
@@ -1,124 +0,0 @@
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
def simplify(text):
    """Replace a fixed set of traditional characters with simplified forms.

    Only the characters listed in the table are affected; every other
    character is passed through unchanged.
    """
    table = str.maketrans({
        '東': '东',
        '聲': '声',
        '韻': '韵',
        '詞': '词',
        '條': '条',
        '對': '对',
        '語': '语',
        '摘': '摘',
        '句': '句',
        '卷': '卷',
        '紅': '红',
        '銅': '铜',
    })
    # One pass over the text; no replacement is itself a table key, so this
    # matches replacing the characters one after another.
    return text.translate(table)
|
||||
|
||||
def _cn_numeral_to_int(numeral):
    """Convert a Chinese numeral such as "十一", "一百零六" or "一〇六" to an int."""
    digits = {'零': 0, '〇': 0, '一': 1, '二': 2, '三': 3, '四': 4,
              '五': 5, '六': 6, '七': 7, '八': 8, '九': 9}
    units = {'十': 10, '百': 100}

    # Several digits with no unit character are read positionally.
    if len(numeral) > 1 and all(ch in digits for ch in numeral):
        return int("".join(str(digits[ch]) for ch in numeral))

    total = 0
    pending = 0  # digit waiting for the unit that follows it
    for ch in numeral:
        if ch in units:
            # A unit with no preceding digit has an implicit 1 ("十" == 10).
            total += (pending or 1) * units[ch]
            pending = 0
        else:
            pending = digits.get(ch, 0)
    return total + pending


def parse_html(file_path):
    """Parse one downloaded volume page into per-rhyme-character records.

    The text of the first ``<div class="poem">`` is read.  The tone/rhyme
    header and the ``卷X之Y`` volume label are extracted, the line after the
    header is taken as the list of rhyme characters, and the remaining text
    is split into ``headword〈annotation〉`` tokens.

    Returns a dict keyed by ``"(韵母)" + character``.  Each value has the
    keys 卷, 声, 韵, 小韵描述, 词条, 对语 and 摘句.  Returns ``{}`` when the
    page has no poem div.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")

    poem_div = soup.find("div", class_="poem")
    if not poem_div:
        return {}

    text = poem_div.get_text()

    # Extract tone and rhyme from the header line.
    tone_rhyme_m = re.search(r"(上平聲|下平聲|上聲|去聲|入聲)\s+(.*?韻.*)", text)
    if tone_rhyme_m:
        current_tone = simplify(tone_rhyme_m.group(1))
        raw_rhyme = tone_rhyme_m.group(2).strip()
        m_rhyme = re.match(r"(.*?)[韻]", raw_rhyme)
        current_rhyme = simplify(m_rhyme.group(1)) if m_rhyme else simplify(raw_rhyme.replace('韻', ''))
    else:
        current_tone = ""
        current_rhyme = ""

    # Extract the volume label.  The numeral is converted as a whole number:
    # mapping it character by character turned "十一" into "101", and the
    # narrower character class never matched volumes from 100 upwards.
    vol_m = re.search(r"卷([零〇一二三四五六七八九十百]+)之(.+)", text)
    if vol_m:
        volume_number = _cn_numeral_to_int(vol_m.group(1))
        v2 = vol_m.group(2).split('\n')[0].strip()
        current_vol = f"{volume_number:02d}之{v2}"
    else:
        current_vol = ""

    # The line right after the tone/rhyme header lists the rhyme characters.
    lines = text.split('\n')
    rhyme_chars = []
    if tone_rhyme_m:
        header = tone_rhyme_m.group(2)
        for i, line in enumerate(lines):
            if header in line:
                # Guard against the header being the very last line.
                if i + 1 < len(lines):
                    # split() with no argument separates on any whitespace,
                    # full-width spaces included.
                    rhyme_chars = [c for c in lines[i + 1].split() if len(c) == 1]
                break

    # Drop titles, header lines and the rhyme-character list itself.
    clean_lines = []
    for line in lines:
        stripped = line.strip()
        if not stripped:
            continue
        if stripped == "欽定四庫全書":
            continue
        if stripped.startswith("御定佩文韻府"):
            continue
        if "平聲" in stripped or "上聲" in stripped or "去聲" in stripped or "入聲" in stripped:
            if "韻" in stripped:
                continue
        # NOTE(review): the scraped source shows a plain-space literal here;
        # isspace() also covers full-width spaces, which the original may
        # have listed explicitly — confirm.
        if all(c in rhyme_chars or c.isspace() for c in stripped):
            continue
        clean_lines.append(stripped)

    clean_text = "".join(clean_lines)

    # Each token is a headword followed by one or more 〈…〉 annotations.
    tokens = re.findall(r"([^〈〉]*)((?:〈[^〉]+〉)+)", clean_text)

    result = {}
    current_char = None

    for word, desc_blocks in tokens:
        word = word.strip()
        desc_content = desc_blocks.replace('〈', '').replace('〉', '')

        if word in rhyme_chars:
            # A rhyme character opens a new record.
            current_char = word
            key = f"(韵母){simplify(current_char)}"
            if key not in result:
                result[key] = {
                    "卷": current_vol,
                    "声": current_tone,
                    "韵": current_rhyme,
                    "小韵描述": simplify(current_char + desc_content),
                    "词条": {},
                    "对语": "",
                    "摘句": ""
                }
        elif word == "對語" or word == "对语":
            if current_char:
                key = f"(韵母){simplify(current_char)}"
                # May occur more than once per character, hence +=.
                result[key]["对语"] += desc_content
        elif word == "摘句":
            if current_char:
                key = f"(韵母){simplify(current_char)}"
                result[key]["摘句"] += desc_content
        else:
            # Ordinary entry filed under the current rhyme character.
            if current_char and word:
                key = f"(韵母){simplify(current_char)}"
                result[key]["词条"][word] = desc_content

    return result
|
||||
|
||||
@@ -1,65 +0,0 @@
|
||||
import os
|
||||
import time
|
||||
import urllib.parse
|
||||
import requests
|
||||
from requests.exceptions import RequestException
|
||||
|
||||
|
||||
def main():
    """Download each URL in ``missing_urls.txt`` into ``html_files/``, one at a time.

    Files that already exist are skipped.  Each download gets up to three
    attempts through the local HTTP proxy, with a 2-second pause after a
    failed attempt and a 0.5-second pause between URLs.
    """
    if not os.path.exists("html_files"):
        os.makedirs("html_files")

    with open("missing_urls.txt", "r", encoding="utf-8") as listing:
        urls = [entry.strip() for entry in listing if entry.strip()]

    proxies = {"http": "http://127.0.0.1:10808", "https": "http://127.0.0.1:10808"}

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    for url in urls:
        # The last segment of the decoded URL is the volume name, e.g. 卷001之1.
        volume_name = urllib.parse.unquote(url).split("/")[-1]
        filename = f"html_files/{volume_name}.html"

        if os.path.exists(filename):
            print(f"Skipping {filename}, already exists.")
            continue

        print(f"Downloading {volume_name} from {url}...")

        # ``remaining`` counts down 3, 2, 1; messages report remaining - 1.
        for remaining in range(3, 0, -1):
            try:
                response = requests.get(
                    url, headers=headers, proxies=proxies, timeout=15
                )
                if response.status_code == 200:
                    with open(filename, "w", encoding="utf-8") as out_f:
                        out_f.write(response.text)
                    print(f"Successfully downloaded {filename}")
                    break
                print(
                    f"HTTP Error {response.status_code} for {url}. Retries left: {remaining - 1}"
                )
            except RequestException as e:
                print(f"Error downloading {url}: {e}. Retries left: {remaining - 1}")

            time.sleep(2)  # Delay before retry
        else:
            print(f"Failed to download {url} after all retries.")

        time.sleep(0.5)  # Small delay between requests


if __name__ == "__main__":
    main()
|
||||
@@ -1,65 +0,0 @@
|
||||
import asyncio
|
||||
import aiohttp
|
||||
from aiohttp_socks import ProxyConnector, ProxyType
|
||||
import os
|
||||
import urllib.parse
|
||||
import time
|
||||
|
||||
|
||||
async def download_file(session, url, filename, semaphore):
    """Fetch ``url`` into ``filename`` while holding ``semaphore``.

    Returns True when the file already exists or was written, and False
    after three failed attempts.  A 2-second pause follows each failed
    attempt.
    """
    async with semaphore:
        if os.path.exists(filename):
            print(f"Skipping {filename}")
            return True

        for _attempt in range(3):
            try:
                async with session.get(url, timeout=30) as response:
                    if response.status != 200:
                        print(f"HTTP Error {response.status} for {url}")
                    else:
                        payload = await response.read()
                        with open(filename, "wb") as out:
                            out.write(payload)
                        print(f"Successfully downloaded {filename}")
                        return True
            except Exception as e:
                print(f"Error for {url}: {e}")

            await asyncio.sleep(2)

        print(f"Failed all retries for {url}")
        return False
|
||||
|
||||
|
||||
async def main():
    """Download every URL in ``missing_urls.txt`` concurrently via a SOCKS5 proxy.

    At most ten downloads run at once.  Output files go to ``html_files/``
    and are named after the last segment of the decoded URL.
    """
    if not os.path.exists("html_files"):
        os.makedirs("html_files")

    with open("missing_urls.txt", "r", encoding="utf-8") as listing:
        urls = [entry.strip() for entry in listing if entry.strip()]

    # SOCKS5 proxy on 127.0.0.1:10808; rdns=True is passed so that the
    # socks5:// scheme behaves like socks5h://.
    proxy_url = "socks5://127.0.0.1:10808"
    connector = ProxyConnector.from_url(proxy_url, rdns=True)

    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

    semaphore = asyncio.Semaphore(10)  # 10 concurrent downloads

    def target_path(url):
        volume_name = urllib.parse.unquote(url).split("/")[-1]
        return f"html_files/{volume_name}.html"

    async with aiohttp.ClientSession(connector=connector, headers=headers) as session:
        tasks = [
            download_file(session, url, target_path(url), semaphore)
            for url in urls
        ]
        await asyncio.gather(*tasks)


if __name__ == "__main__":
    asyncio.run(main())
|
||||
@@ -1,65 +0,0 @@
|
||||
import os
|
||||
import time
|
||||
import urllib.parse
|
||||
import requests
|
||||
from requests.exceptions import RequestException
|
||||
import concurrent.futures
|
||||
|
||||
def download_url(url):
    """Download ``url`` into ``html_files/<last decoded path segment>.html``.

    Makes up to three attempts through the local HTTP proxy, pausing two
    seconds after each failed attempt.

    Returns:
        ``(True, filename)`` if the file already existed or was written,
        ``(False, filename)`` once every attempt has failed.
    """
    proxies = {"http": "http://127.0.0.1:10808", "https": "http://127.0.0.1:10808"}
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    decoded_url = urllib.parse.unquote(url)
    volume_name = decoded_url.split("/")[-1]
    filename = f"html_files/{volume_name}.html"

    if os.path.exists(filename):
        # Already downloaded on an earlier run.
        return True, filename

    retries = 3
    while retries > 0:
        try:
            response = requests.get(url, headers=headers, proxies=proxies, timeout=15)
            if response.status_code == 200:
                with open(filename, "w", encoding="utf-8") as out_f:
                    out_f.write(response.text)
                print(f"Successfully downloaded {filename}")
                return True, filename
            print(f"HTTP Error {response.status_code} for {url}. Retries left: {retries - 1}")
        except RequestException as e:
            # Report the failure instead of swallowing it, so a dead proxy
            # or DNS problem is visible in the log.
            print(f"Error downloading {url}: {e}. Retries left: {retries - 1}")

        retries -= 1
        time.sleep(2)

    print(f"Failed to download {url} after all retries.")
    return False, filename
|
||||
|
||||
def main():
    """Download every URL in ``missing_urls.txt`` using 20 worker threads."""
    if not os.path.exists("html_files"):
        os.makedirs("html_files")

    with open("missing_urls.txt", "r", encoding="utf-8") as listing:
        urls = [entry.strip() for entry in listing if entry.strip()]

    # One task per URL; results are drained as they finish so that any
    # exception raised inside a worker is reported here.
    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as pool:
        pending = [pool.submit(download_url, target) for target in urls]
        for finished in concurrent.futures.as_completed(pending):
            try:
                finished.result()
            except Exception as e:
                print(f"Exception during download: {e}")


if __name__ == "__main__":
    main()
|
||||
@@ -1,79 +0,0 @@
|
||||
import os
|
||||
import time
|
||||
import urllib.parse
|
||||
import requests
|
||||
from requests.exceptions import RequestException
|
||||
import concurrent.futures
|
||||
|
||||
|
||||
def download_url(url):
    """Download ``url`` into ``html_files/<last decoded path segment>.html``.

    Makes up to five attempts through the local HTTP proxy.  A 403 response
    is treated as rate limiting and adds a 5-second wait on top of the
    3-second pause that follows every failed attempt.

    Returns:
        ``(True, filename)`` if the file already existed or was written,
        ``(False, filename)`` once every attempt has failed.
    """
    proxies = {"http": "http://127.0.0.1:10808", "https": "http://127.0.0.1:10808"}
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
    }

    decoded_url = urllib.parse.unquote(url)
    volume_name = decoded_url.split("/")[-1]
    filename = f"html_files/{volume_name}.html"

    if os.path.exists(filename):
        return True, filename

    retries = 5
    while retries > 0:
        try:
            response = requests.get(url, headers=headers, proxies=proxies, timeout=15)
            if response.status_code == 200:
                with open(filename, "w", encoding="utf-8") as out_f:
                    out_f.write(response.text)
                print(f"Successfully downloaded {filename}")
                return True, filename
            elif response.status_code == 403:
                print(
                    f"HTTP Error 403 for {url}. Waiting longer to avoid rate limit..."
                )
                time.sleep(5)
            else:
                print(
                    f"HTTP Error {response.status_code} for {url}. Retries left: {retries - 1}"
                )
        except RequestException as e:
            # Report the failure instead of swallowing it, so a dead proxy
            # or DNS problem is visible in the log.
            print(f"Error downloading {url}: {e}. Retries left: {retries - 1}")

        retries -= 1
        time.sleep(3)

    print(f"Failed to download {url} after all retries.")
    return False, filename
|
||||
|
||||
|
||||
def main():
    """Download every URL in ``still_missing.txt`` using 5 worker threads."""
    if not os.path.exists("html_files"):
        os.makedirs("html_files")

    with open("still_missing.txt", "r", encoding="utf-8") as listing:
        urls = [entry.strip() for entry in listing if entry.strip()]

    print(f"Starting download for {len(urls)} remaining files...")

    # Five workers only: a larger pool triggered 403 Forbidden rate limits.
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as pool:
        pending = [pool.submit(download_url, target) for target in urls]
        for finished in concurrent.futures.as_completed(pending):
            try:
                finished.result()
            except Exception as e:
                print(f"Exception during download: {e}")


if __name__ == "__main__":
    main()
|
||||
@@ -1,96 +0,0 @@
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7070%E4%B9%8B4
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7073%E4%B9%8B1
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7073%E4%B9%8B2
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7073%E4%B9%8B3
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7074%E4%B9%8B4
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7074%E4%B9%8B5
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7077%E4%B9%8B4
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7081%E4%B9%8B1
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7081%E4%B9%8B2
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7081%E4%B9%8B3
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7082%E4%B9%8B4
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7082%E4%B9%8B5
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7083%E4%B9%8B1
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7083%E4%B9%8B2
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7083%E4%B9%8B3
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7083%E4%B9%8B4
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7084%E4%B9%8B1
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7084%E4%B9%8B2
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7085%E4%B9%8B1
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7085%E4%B9%8B2
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7085%E4%B9%8B3
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7085%E4%B9%8B4
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B02
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B03
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B04
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B05
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B06
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B07
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B08
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B09
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7090%E4%B9%8B10
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7091%E4%B9%8B1
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7091%E4%B9%8B2
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7091%E4%B9%8B3
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7091%E4%B9%8B4
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7092%E4%B9%8B1
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7092%E4%B9%8B2
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7092%E4%B9%8B4
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7093%E4%B9%8B1
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7093%E4%B9%8B2
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7093%E4%B9%8B6
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7093%E4%B9%8B7
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7093%E4%B9%8B8
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7094%E4%B9%8B1
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7094%E4%B9%8B2
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7095%E4%B9%8B1
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7095%E4%B9%8B2
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7095%E4%B9%8B3
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7095%E4%B9%8B4
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7095%E4%B9%8B5
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7095%E4%B9%8B6
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7096%E4%B9%8B1
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7096%E4%B9%8B2
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7096%E4%B9%8B3
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7098%E4%B9%8B1
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7098%E4%B9%8B2
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7098%E4%B9%8B4
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7098%E4%B9%8B5
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B2
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B3
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B5
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B7
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B8
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B9
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7099%E4%B9%8B010
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B1
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B2
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B3
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B4
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B6
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B7
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B8
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B9
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B010
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B011
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7100%E4%B9%8B012
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7101%E4%B9%8B1
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7101%E4%B9%8B2
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7101%E4%B9%8B3
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7101%E4%B9%8B4
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B1
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B2
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B3
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B4
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B5
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B6
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B7
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7102%E4%B9%8B8
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7103%E4%B9%8B1
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7103%E4%B9%8B2
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7104%E4%B9%8B1
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7104%E4%B9%8B2
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7105%E4%B9%8B1
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7105%E4%B9%8B2
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7105%E4%B9%8B3
|
||||
https://zh.wikisource.org/wiki/%E5%BE%A1%E5%AE%9A%E4%BD%A9%E6%96%87%E9%9F%BB%E5%BA%9C_(%E5%9B%9B%E5%BA%AB%E5%85%A8%E6%9B%B8%E6%9C%AC)/%E5%8D%B7106%E4%B9%8B2
|
||||
@@ -1,53 +0,0 @@
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Exploratory parse of one downloaded volume: prints the tone, rhyme, volume,
# rhyme characters and the first (word, descriptions) token found in the text.
with open("html_files/卷001之1.html", "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f.read(), "html.parser")

poem_div = soup.find("div", class_="poem")
if poem_div is None:
    # find() returns None when the element is absent; fail with a clear message
    # instead of an AttributeError on .get_text().
    raise SystemExit('No <div class="poem"> found in html_files/卷001之1.html')
text = poem_div.get_text()

# Extract Tone and Rhyme
tone_rhyme_m = re.search(r"(上平聲|下平聲|上聲|去聲|入聲)\s+(.*?韻.*)", text)
if tone_rhyme_m:
    print("Tone:", tone_rhyme_m.group(1))
    print("Rhyme:", tone_rhyme_m.group(2))

# Extract Volume
vol_m = re.search(r"卷(.)之(.)", text)
if vol_m:
    print("Volume:", f"0{vol_m.group(1)}之{vol_m.group(2)}" if vol_m.group(1) in '一二三四五六七八九' else f"{vol_m.group(1)}之{vol_m.group(2)}")

# Extract chars
lines = text.split('\n')
rhyme_chars = []
if tone_rhyme_m:  # without a header match there is nothing to search for
    rhyme_header = tone_rhyme_m.group(2)
    for i, line in enumerate(lines):
        if rhyme_header in line:
            # the next line usually has the characters
            if i + 1 < len(lines):
                # split() with no argument already splits on any Unicode
                # whitespace, so no space normalisation is needed first.
                rhyme_chars = [c for c in lines[i + 1].split() if len(c) == 1]
            break

print("Chars:", rhyme_chars)

# Now, we want to strip all lines that are headers.
# Headers repeat: "欽定四庫全書", "御定佩文韻府...", "上平聲..."
# so the known header lines are simply filtered out.
tone_markers = ("平聲", "上聲", "去聲", "入聲")
clean_lines = []
for line in lines:
    stripped = line.strip()
    if not stripped:
        continue
    if stripped == "欽定四庫全書":
        continue
    if stripped.startswith("御定佩文韻府"):
        continue
    if "韻" in stripped and any(marker in stripped for marker in tone_markers):
        continue
    # Is it the chars line? (only rhyme characters and whitespace)
    if all(c in rhyme_chars or c.isspace() for c in stripped):
        continue
    clean_lines.append(stripped)

clean_text = "".join(clean_lines)
print("Start of clean text:", clean_text[:100])

tokens = re.findall(r"([^〈〉]*)((?:〈[^〉]+〉)+)", clean_text)
if tokens:
    print("First token:", tokens[0])
else:
    print("No tokens found")
|
||||
|
||||
@@ -1,10 +0,0 @@
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Print a one-line preview of the first 20 <p> elements inside div.poem.
with open("html_files/卷001之1.html", "r", encoding="utf-8") as f:
    markup = f.read()
soup = BeautifulSoup(markup, "html.parser")

poem_div = soup.find("div", class_="poem")
if poem_div:
    paragraphs = poem_div.find_all("p")[:20]
    for i, p in enumerate(paragraphs):
        preview = p.text[:100].replace('\n', ' ')
        print(f"--- P {i} ---")
        print(preview)
|
||||
@@ -1,12 +0,0 @@
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Print the first 50 non-blank, stripped text lines of div.poem with indices.
with open("html_files/卷001之1.html", "r", encoding="utf-8") as f:
    markup = f.read()
soup = BeautifulSoup(markup, "html.parser")

poem_div = soup.find("div", class_="poem")
if poem_div:
    stripped_lines = (raw.strip() for raw in poem_div.get_text().split("\n"))
    lines = [text_line for text_line in stripped_lines if text_line]
    for i, line in enumerate(lines[:50]):
        print(f"{i}: {line}")
|
||||
@@ -1,5 +0,0 @@
|
||||
import re
|
||||
# Demo: split a sample string into (word, one-or-more 〈...〉 blocks) pairs.
text = "對語〈渭北 江東〉〈 平北 安東〉摘句〈力障百川東〉"
token_pattern = re.compile(r"([^〈〉]*)((?:〈[^〉]+〉)+)")
tokens = token_pattern.findall(text)
for i, token in enumerate(tokens):
    word, desc_blocks = token
    print(f"Token {i}: WORD='{word}' DESCS={desc_blocks}")
|
||||
@@ -1,54 +0,0 @@
|
||||
import json
|
||||
import re
|
||||
|
||||
with open('peiwenyunfu.json', 'r', encoding='utf-8') as f:
|
||||
data = json.load(f)
|
||||
|
||||
prefixes = ["韻藻", "增"]


def replace_pipes(content, word, reset_gap=5):
    """Replace each "丨" in *content* with successive characters of *word*.

    At most one leading marker from ``prefixes`` is stripped from *word*
    (only when something remains after it). Each "丨" is then replaced by the
    next character of the cleaned word, cycling when the word is exhausted.
    The position restarts at the first character whenever at least
    *reset_gap* non-"丨" characters precede the "丨" since the previous one
    (or since the start of *content*).

    Args:
        content: Text containing "丨" placeholders.
        word: Headword whose characters fill the placeholders.
        reset_gap: Minimum run of ordinary characters that restarts the
            cycle. Defaults to 5, the value previously hard-coded.

    Returns:
        The text with every "丨" substituted; if *word* is empty the
        placeholders are simply removed.
    """
    clean_word = word
    for p in prefixes:
        if clean_word.startswith(p) and len(clean_word) > len(p):
            clean_word = clean_word[len(p):]
            break  # only strip one prefix

    word_len = len(clean_word)
    if word_len == 0:
        return content.replace("丨", "")

    result = []
    pipe_idx = 0
    chars_since_last_pipe = 0

    for char in content:
        if char != "丨":
            result.append(char)
            chars_since_last_pipe += 1
            continue

        # A long gap is treated as the start of a new occurrence of the word.
        if chars_since_last_pipe >= reset_gap:
            pipe_idx = 0

        result.append(clean_word[pipe_idx % word_len])
        pipe_idx += 1
        chars_since_last_pipe = 0

    return "".join(result)
|
||||
|
||||
# Test specific words
|
||||
# Sample (headword, quotation) pairs run through replace_pipes for eyeballing.
test_cases = [
    ("首陽東", "詩采葑采葑丨丨之丨"),
    ("馬首東", "左傳欒黶曰吾丨丨欲丨乃歸下軍從之"),
    ("澗瀍東", "書洛誥我乃卜丨水東丨水西惟洛食我又卜瀍水丨亦惟洛食"),
    ("日夜東", "丨丨虞集詩絳桃風急丨丨丨王惲詩付與衡漳丨丨丨許有壬詩江水舟"),
    ("東海東", "樓鑰詩萬里逺在丨丨丨張經詩崑崙之西丨"),
]

separator = "-" * 40
for headword, quotation in test_cases:
    print(f"Word: {headword}")
    print(f"Orig: {quotation}")
    print(f"Fix : {replace_pipes(quotation, headword)}")
    print(separator)
|
||||
@@ -1,37 +0,0 @@
|
||||
import json
|
||||
|
||||
prefixes = ["韻藻", "增"]


def replace_pipes_no_reset(content, word):
    """Replace each "丨" in *content* with successive characters of *word*.

    At most one leading marker from ``prefixes`` is stripped from *word*
    (only when something remains after it). Placeholders are filled by cycling
    through the cleaned word across the whole of *content*; the position is
    never restarted. If *word* is empty the placeholders are removed.
    """
    from itertools import cycle

    clean_word = word
    for p in prefixes:
        if clean_word.startswith(p) and len(clean_word) > len(p):
            clean_word = clean_word[len(p):]
            break

    if not clean_word:
        return content.replace("丨", "")

    # cycle() replaces the hand-rolled `index % len` bookkeeping.
    fill = cycle(clean_word)
    return "".join(next(fill) if char == "丨" else char for char in content)
|
||||
|
||||
# Sample (headword, quotation) pairs run through replace_pipes_no_reset.
test_cases = [
    ("首陽東", "詩采葑采葑丨丨之丨"),
    ("馬首東", "左傳欒黶曰吾丨丨欲丨乃歸下軍從之"),
    ("澗瀍東", "書洛誥我乃卜丨水東丨水西惟洛食我又卜瀍水丨亦惟洛食"),
    ("日夜東", "丨丨虞集詩絳桃風急丨丨丨王惲詩付與衡漳丨丨丨許有壬詩江水舟"),
    ("東海東", "樓鑰詩萬里逺在丨丨丨張經詩崑崙之西丨"),
]

separator = "-" * 40
for headword, quotation in test_cases:
    print(f"Word: {headword}")
    print(f"NoRst: {replace_pipes_no_reset(quotation, headword)}")
    print(separator)
|
||||
@@ -1,56 +0,0 @@
|
||||
import json
|
||||
import re
|
||||
|
||||
prefixes = ["韻藻", "增"]


def replace_pipes_hybrid(content, word):
    """Replace runs of "丨" in *content* using *word*, realigning on full runs.

    At most one leading marker from ``prefixes`` is stripped from *word*
    (only when something remains after it). Each maximal run of "丨" is then
    handled as a unit:

    * run length is a multiple of the word length -> the run becomes whole
      copies of the word and the running position is reset to 0;
    * otherwise -> the run is filled from the current running position,
      cycling through the word, and the position advances by the run length.

    If *word* is empty the placeholders are removed.
    """
    clean_word = word
    for p in prefixes:
        if clean_word.startswith(p) and len(clean_word) > len(p):
            clean_word = clean_word[len(p):]
            break

    word_len = len(clean_word)
    if word_len == 0:
        return content.replace("丨", "")

    # Running position within clean_word, shared across runs via the closure.
    pipe_idx = 0

    def repl(match):
        nonlocal pipe_idx
        block_len = len(match.group(0))

        if block_len % word_len == 0:
            # Full word match! Reset alignment.
            pipe_idx = 0
            return clean_word * (block_len // word_len)

        # Partial word match. Continue the current sequence.
        start = pipe_idx
        pipe_idx += block_len
        return "".join(
            clean_word[i % word_len] for i in range(start, start + block_len)
        )

    return re.sub(r'丨+', repl, content)
|
||||
|
||||
# Sample (headword, quotation) pairs run through replace_pipes_hybrid.
test_cases = [
    ("首陽東", "詩采葑采葑丨丨之丨"),
    ("馬首東", "左傳欒黶曰吾丨丨欲丨乃歸下軍從之"),
    ("澗瀍東", "書洛誥我乃卜丨水東丨水西惟洛食我又卜瀍水丨亦惟洛食"),
    ("日夜東", "丨丨虞集詩絳桃風急丨丨丨王惲詩付與衡漳丨丨丨許有壬詩江水舟"),
    ("東海東", "樓鑰詩萬里逺在丨丨丨張經詩崑崙之西丨"),
]

separator = "-" * 40
for headword, quotation in test_cases:
    print(f"Word: {headword}")
    print(f"Orig: {quotation}")
    print(f"Hybr: {replace_pipes_hybrid(quotation, headword)}")
    print(separator)

# One extra case, shown on its own after the main batch.
test_cases.append(("紫殿東", "時魚躍丨丨丨温庭筠詩一夕丨丨"))

for headword, quotation in test_cases[-1:]:
    print(f"Word: {headword}")
    print(f"Orig: {quotation}")
    print(f"Hybr: {replace_pipes_hybrid(quotation, headword)}")
    print(separator)
|
||||
@@ -1,31 +0,0 @@
|
||||
import json
|
||||
from parser import parse_html
|
||||
|
||||
def test_parse_html():
    """Smoke-test ``parse_html`` on one volume and dump its result to disk.

    Writes the parsed result to ``output.json`` for manual inspection, asserts
    it is a dict, and, when it has at least one key, asserts that the first
    entry contains every expected field.
    """
    result = parse_html("html_files/卷001之1.html")

    # Save for manual inspection
    with open("output.json", "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    # Check that it returns a dictionary
    assert isinstance(result, dict)

    # Let's see what keys are in it
    keys = list(result.keys())
    print("Keys found:", keys)

    if not keys:
        return

    first_entry_key = keys[0]
    for field in ("卷", "声", "韵", "小韵描述", "词条", "对语", "摘句"):
        assert field in result[first_entry_key]


if __name__ == "__main__":
    test_parse_html()
    print("Tests passed!")
|
||||
@@ -1,49 +0,0 @@
|
||||
import json
|
||||
import re
|
||||
|
||||
# Exploratory check on the first few entries: does the number of "丨"
# placeholders in each quotation divide evenly by the headword length?
with open('peiwenyunfu.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Analyze a few
count_match = 0
count_mismatch = 0

# NOTE: the [:5] slice is taken before "metadata"/"preface" are skipped, so
# fewer than five rhymes may actually be analysed.
for rhyme, r_data in list(data.items())[:5]:
    if rhyme in ["metadata", "preface"]:
        continue
    print(f"\nRhyme: {rhyme}")

    # 1. 小韵描述: every placeholder stands for the rhyme key itself here.
    desc = r_data.get("小韵描述", "")
    desc_fixed = desc.replace("丨", rhyme)
    print(f"Desc original: {desc[:30]}...")
    print(f"Desc fixed:    {desc_fixed[:30]}...")

    # 2. 词条
    for word, content in list(r_data.get("词条", {}).items())[:5]:
        pipe_count = content.count("丨")
        word_len = len(word)
        if pipe_count == 0:
            continue

        print(f"Word: {word} (len {word_len}), pipes: {pipe_count}")
        print(f"Original: {content}")

        # Test replacing. An empty headword is reported as a mismatch rather
        # than raising ZeroDivisionError on the modulo.
        if word_len and pipe_count % word_len == 0:
            # We can replace them in groups of word_len
            pieces = []
            pipe_idx = 0
            for char in content:
                if char == "丨":
                    pieces.append(word[pipe_idx % word_len])
                    pipe_idx += 1
                else:
                    pieces.append(char)
            fixed_content = "".join(pieces)
            print(f"Fixed:    {fixed_content}")
            count_match += 1
        else:
            print("MISMATCH length!")
            count_mismatch += 1

print(f"\nMatches: {count_match}, Mismatches: {count_mismatch}")
|
||||
@@ -1,46 +0,0 @@
|
||||
import asyncio
|
||||
from playwright.async_api import async_playwright
|
||||
import urllib.parse
|
||||
import sys
|
||||
|
||||
|
||||
_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"


async def _fetch_content_length(playwright, url, proxy_server=None):
    """Load *url* in headless Chromium and return the length of its HTML.

    The browser is launched through *proxy_server* when one is given, and is
    always closed before returning or raising.
    """
    launch_options = {"headless": True}
    if proxy_server is not None:
        launch_options["proxy"] = {"server": proxy_server}

    browser = await playwright.chromium.launch(**launch_options)
    try:
        context = await browser.new_context(user_agent=_USER_AGENT)
        page = await context.new_page()
        await page.goto(url, timeout=30000, wait_until="domcontentloaded")
        return len(await page.content())
    finally:
        # Close on every path so a failed attempt does not leak the browser.
        await browser.close()


async def main():
    """Try to load the URL given as argv[1] via the local proxy, then directly.

    Prints the HTML length on success. If the proxied attempt raises, the
    error is printed and one attempt without a proxy follows.
    """
    if len(sys.argv) < 2:
        raise SystemExit(f"usage: {sys.argv[0]} URL")
    url = sys.argv[1]
    proxy_server = "socks5://127.0.0.1:10808"

    async with async_playwright() as p:
        # Try with proxy
        try:
            print(f"Loading {url} via proxy...")
            length = await _fetch_content_length(p, url, proxy_server)
            print(f"Success! Content length: {length}")
            return
        except Exception as e:
            print(f"Error via proxy: {e}")

        # Try without proxy
        print("Retrying without proxy...")
        try:
            length = await _fetch_content_length(p, url)
            print(f"Success! Content length: {length}")
        except Exception as e2:
            print(f"Error without proxy: {e2}")


if __name__ == "__main__":
    asyncio.run(main())
|
||||
@@ -1,21 +0,0 @@
|
||||
import json
|
||||
|
||||
# Count entries whose number of "丨" placeholders is not a multiple of the
# headword length, and show the first 20 of them.
with open('peiwenyunfu.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

mismatches = []

for rhyme, r_data in data.items():
    if rhyme in ["metadata", "preface"]:
        continue
    for word, content in r_data.get("词条", {}).items():
        pipe_count = content.count("丨")
        if pipe_count == 0:
            continue
        # An empty headword cannot align with any placeholder; count it as a
        # mismatch instead of raising ZeroDivisionError.
        if not word or pipe_count % len(word) != 0:
            mismatches.append((word, pipe_count))

print(f"Total mismatches: {len(mismatches)}")
for w, p in mismatches[:20]:
    print(f"{w} (len {len(w)}), pipes: {p}")
|
||||
@@ -1,13 +0,0 @@
|
||||
import json
|
||||
|
||||
# Print the stored content of a handful of specific headwords.
with open('peiwenyunfu.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

wanted_words = ["紫殿東", "少微東", "東海東", "日夜東", "隔西東"]
skipped_sections = ["metadata", "preface"]

for rhyme, r_data in data.items():
    if rhyme in skipped_sections:
        continue
    entries = r_data.get("词条", {})
    for word, content in entries.items():
        if word not in wanted_words:
            continue
        print(f"Word: {word}")
        print(f"Content: {content}")
        print("-" * 40)
|
||||
@@ -1,18 +0,0 @@
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Exploratory extraction of the tone, rhyme header and the line of rhyme
# characters that follows it in one downloaded volume.
with open("html_files/卷001之1.html", "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f.read(), "html.parser")

poem_div = soup.find("div", class_="poem")
if poem_div is None:
    # find() returns None when the element is absent; fail with a clear message
    # instead of an AttributeError on .get_text().
    raise SystemExit('No <div class="poem"> found in html_files/卷001之1.html')
text = poem_div.get_text()

# Extract the list of characters.
# It appears after "一東韻一" or similar.
m = re.search(r"(上平聲|下平聲|上聲|去聲|入聲)\s+(.*?韻.*?)\n(.*?)\n", text)
if m:
    print("Tone:", m.group(1))
    print("Rhyme:", m.group(2))
    print("Chars line:", m.group(3))
    # split() with no argument already splits on any Unicode whitespace,
    # so no space normalisation is needed first.
    rhyme_chars = [c for c in m.group(3).split() if len(c) == 1]
    print("Chars:", rhyme_chars)
|
||||
@@ -1,32 +0,0 @@
|
||||
import json
|
||||
import re
|
||||
|
||||
# Count entries whose number of "丨" placeholders is not a multiple of the
# headword length after removing the known leading markers.
with open('peiwenyunfu.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

prefixes = ["韻藻", "增"]

mismatches = []
entries_with_pipes = 0  # counts entries, not individual placeholders

for rhyme, r_data in data.items():
    if rhyme in ["metadata", "preface"]:
        continue
    for word, content in r_data.get("词条", {}).items():
        clean_word = word
        # NOTE(review): there is no `break` here, so a word starting with
        # "韻藻增" loses both markers — confirm whether that is intended.
        for p in prefixes:
            if clean_word.startswith(p) and len(clean_word) > len(p):
                clean_word = clean_word[len(p):]

        pipe_count = content.count("丨")
        if pipe_count == 0:
            continue
        entries_with_pipes += 1
        # An empty headword cannot align with any placeholder; count it as a
        # mismatch instead of raising ZeroDivisionError.
        if not clean_word or pipe_count % len(clean_word) != 0:
            mismatches.append((word, clean_word, pipe_count, content))

print(f"Total entries with pipes: {entries_with_pipes}")
print(f"Total mismatches after stripping: {len(mismatches)}")
for w, cw, p, c in mismatches[:10]:
    print(f"{w} -> {cw} (len {len(cw)}), pipes: {p}")
    print(f"   {c}")
|
||||
@@ -1,21 +0,0 @@
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Exploratory tokenizer: split the volume text into (word, descriptions)
# pairs and print the first 20.
with open("html_files/卷001之1.html", "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f.read(), "html.parser")

poem_div = soup.find("div", class_="poem")
if poem_div is None:
    # find() returns None when the element is absent; fail with a clear message
    # instead of an AttributeError on .get_text().
    raise SystemExit('No <div class="poem"> found in html_files/卷001之1.html')
text = poem_div.get_text().replace('\n', '')

# Remove header junk for this test (just find the first '東〈')
start_idx = text.find('東〈')
if start_idx == -1:
    # str.find returns -1 when absent; text[-1:] would silently keep only the
    # last character, so stop instead.
    raise SystemExit("Marker '東〈' not found in the volume text")
text = text[start_idx:]

# Tokenize into pairs of (Word, Description).
# Several 〈...〉 blocks can follow one word (e.g. 對語〈...〉〈...〉), so each
# token is a chunk of non-bracket characters plus one or more 〈...〉 blocks.
tokens = re.findall(r"([^〈〉]*)((?:〈[^〉]+〉)+)", text)

for i, (word, desc_blocks) in enumerate(tokens[:20]):
    print(f"Token {i}: WORD='{word}' DESCS={desc_blocks[:30]}...")
|
||||
|
||||
93
分类.md
93
分类.md
@@ -0,0 +1,93 @@
|
||||
# 类书分类与特点梳理
|
||||
|
||||
为辅助专业诗歌创作小程序提供“知识库”,整理现有类书数据的优缺点、特点及在辅助创作中的核心价值如下:
|
||||
|
||||
## 一、词条式
|
||||
这类类书通常以主题(如天文地理、自然物候等)分类,适合根据意象(如“雪”、“月”)进行正向检索找语料。
|
||||
|
||||
- **《白孔六帖》**
|
||||
- **内容与编排顺序**:按天文地理、历法礼仪、生活物件等类目编排。
|
||||
- **特点**:采辑各种典籍中的成语、典故、短语,多为四字短句,并附带简短的释义。
|
||||
- **整理问题**:提取很不干净,正文与释义括号混合在一起且缺乏断句。
|
||||
- **辅助创作价值**:可作为提炼“四字骈语”或冷僻典故的原始素材库,但需大量NLP清洗。
|
||||
- **典型例子**:`"content": "白髙眀柔克(髙明天也柔克寒暑不干)隂隲下民(言天黙定下民之命)天尊(地卑)..."`
|
||||
|
||||
- **《北堂书钞》**
|
||||
- **内容与编排顺序**:按帝王、后妃、政术、刑法、官职、礼仪等社会制度与名物编排。
|
||||
- **特点**:成书于隋,引用起自三代、汉、魏,迄于宋、齐。侧重于对概念的追本溯源,对词条解释较为详细。
|
||||
- **整理问题**:JSON提取的正文部分缺乏标点,长句粘连。
|
||||
- **辅助创作价值**:古朴凝练。如创作者需要引用较为正统的经史概念(如写咏史诗),该书能提供最原汁原味的早期语料。
|
||||
- **典型例子**:`"content": "皇者天人之總稱 帝者天號 正氣爲帝 帝者天下之所適王者天下之所徃也..."`
|
||||
|
||||
- **《初学记》**
|
||||
- **内容与编排顺序**:按天、岁时、官职、地理等编排。每个词条下细分为“叙事”、“事对”、“诗文”。
|
||||
- **特点**:词条内容非常详细、层次分明,具有极强的结构化特征。
|
||||
- **整理问题**:整理得非常干净,JSON层级保留了原始分类结构。
|
||||
- **辅助创作价值**:**价值极高**。“事对”直接提供了现成的对仗词汇(写诗利器);“诗文”栏目则方便查阅前人咏此物的范本。
|
||||
- **典型例子**:`"事对": "轉葢 倚杵(桓譚新論天如葢轉左旋...) 覆盆 轉轂(王充論衡曰...)"`
|
||||
|
||||
- **《海录碎事》**
|
||||
- **内容与编排顺序**:按非常细碎的关键词(天、地、衣冠等)分类。
|
||||
- **特点**:每一类下词条过细(常为生僻两字词),每一词条下内容极少,通常只有一两句包含该词的引文。
|
||||
- **整理问题**:词条过度碎片化,键名就是细碎词汇。
|
||||
- **辅助创作价值**:相当于一个“逆向用词示例库”。诗人想用某个生僻意象时,用它查看古人如何将其嵌入诗句中。
|
||||
- **典型例子**:`"曽穹": [{"content": "蹀足循廣除瞬目矖曽穹(文選謝惠連詩)"}]`
|
||||
|
||||
- **《骈字类编》**
|
||||
- **内容与编排顺序**:按天地、时令、山水、珍宝、器物等词汇大类编排。
|
||||
- **特点**:专收“骈语”(双音节词),词条极多,详细列出了该词在各路经史子集中出现的位置。
|
||||
- **整理问题**:长段引文粘连,缺少现代标点。
|
||||
- **辅助创作价值**:**价值极高**。古诗词创作最核心的就是对“双字词汇”的拿捏,此书就是一个庞大且天然的古典双字词语境库。
|
||||
- **典型例子**:`"天地": "易干夫大人者与天地合其德 又坤天地变化草木蕃天地闭贤人隐..."`
|
||||
|
||||
- **《太平御览》**
|
||||
- **内容与编排顺序**:以天、地、人、事、物为大类顺序。
|
||||
- **特点**:在前代《修文殿御览》《艺文类聚》等书基础上编纂而成,包罗万象,词条内容全部为原文引文。
|
||||
- **整理问题**:带有原书排版格式(换行、`《书名》曰`),阅读体验较佳。
|
||||
- **辅助创作价值**:提供最详实的事物背景知识,适合在需要了解某个意象(如“雪”)的全面历史文化背景时使用。
|
||||
- **典型例子**:`"《三五曆記》曰:未有天地之時,混沌狀如雞子,溟涬始牙..."`
|
||||
|
||||
- **《艺文类聚》**
|
||||
- **内容与编排顺序**:按天、岁时、地理、帝王、人、乐、职官等编排。
|
||||
- **特点**:事文交织。词条收录较广,既有经史中的“叙事”,也有大量的历代【诗】、【赋】、【赞】。
|
||||
- **整理问题**:带有原书的标点和分段标记,格式清晰。
|
||||
- **辅助创作价值**:极好的文学创作资料库,帮助创作者一站式看到某个主题在古代诗文中的各种形态。
|
||||
- **典型例子**:`"【詩】晉傅玄《兩儀詩》曰:兩儀始分.元氣上清.列宿垂象.六位時成..."`
|
||||
|
||||
- **《玉海》**
|
||||
- **内容与编排顺序**:详分天文、地理、典章制度等。
|
||||
- **特点**:词条常为长篇大论,注重典章制度、天文地理的详细考证。
|
||||
- **整理问题**:存在词条名拆分或提取不精准的问题(如把“中宫二十八舍”拆断处理)。正文缺乏标点。
|
||||
- **辅助创作价值**:提供精准宏大的制度与天文星象知识,适合创作偏严肃或庙堂题材的诗歌。
|
||||
- **典型例子**:`"中宫": "漢天文志(史天官書同)中宫天極星其一明者泰一之常居也旁三星三公..."`
|
||||
|
||||
- **《渊鉴类函》**
|
||||
- **内容与编排顺序**:按大部类排布。
|
||||
- **特点**:清代集大成之作,将引文明确区分为“原”(原类书已有)与“增”(清代新增)。
|
||||
- **整理问题**:使用空格作为句读分隔,未见全角标点。
|
||||
- **辅助创作价值**:覆盖面最广的兜底宝库,适合查阅各种意象的演变和最全面的引文集合。
|
||||
- **典型例子**:`"原釋名曰天坦也坦然髙而逺也 增又曰天顯也在上髙顯也..."`
|
||||
|
||||
## 二、韵式
|
||||
这类类书专为押韵而生,以“韵部”(如平水韵“一东”)或“韵字”为一级分类,适合在写格律诗卡壳、需要找特定韵脚词汇时使用。
|
||||
|
||||
- **《佩文韵府》**
|
||||
- **内容与编排顺序**:按平水韵分类(如“一东”),下系以该字为尾的各种词条及摘句。
|
||||
- **特点**:非常详细,包含声调、韵部说明,以及海量的带出处短句。以元代《韵府群玉》和明代《五车韵瑞》为基础增补。
|
||||
- **整理问题**:JSON结构层次非常清晰。使用“丨”符号代替原韵字(如“東”被替换为“丨”)。
|
||||
- **辅助创作价值**:**写诗必备神器**。想用“东”韵时,能瞬间获得大量以东结尾的词汇(如“南东”、“活东”)及例句,极大辅助押韵。
|
||||
- **典型例子**:`"(韵母)东": { "小韵描述": "东德红切眷方也...", "词条": { "活東": "爾雅科斗丨丨蝦蟇也...", "牆東": "後漢書避世丨丨王君公..." } }`
|
||||
|
||||
- **《韵府群玉》**
|
||||
- **内容与编排顺序**:按大韵分类,列出小韵和具体词条。
|
||||
- **特点**:早期的韵书,条目较为简练紧凑。
|
||||
- **整理问题**:条目内容被尖括号`〈〉`包裹,夹杂部分注音(如“徳紅切”)。
|
||||
- **辅助创作价值**:与佩文韵府同理,但体量更小,适合快速查阅核心的传统押韵典故。
|
||||
- **典型例子**:`"東": { "道東": "〈漢鄭𤣥事馬融辭歸融曰吾道東矣本〉" }`
|
||||
|
||||
- **《五车韵瑞》**
|
||||
- **内容与编排顺序**:暂缺(数据获取失败,尚无法核对)。
|
||||
- **特点**:暂缺(数据获取失败,尚无法核对)。
|
||||
- **整理问题**:**严重问题**,当前文件夹内的 `allorigins.json` 数据获取失败,内容实际上是 Nginx 的 `500 Internal Server Error` 报错网页代码,并非JSON数据。
|
||||
- **辅助创作价值**:暂时无价值。需要修复爬虫和数据源。
|
||||
- **典型例子**:`<html><head><title>500 Internal Server Error</title></head><body>...`
|
||||
@@ -1,161 +0,0 @@
|
||||
import bs4
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
|
||||
html_dir = "epub_extracted/OPS"
|
||||
|
||||
def parse_html(filepath):
    """Flatten one volume's HTML into an ordered list of events.

    Every direct child of each <p> inside a <div class="poem"> becomes at
    most one event:

    * ``("section", label)`` for a <span> that has an id and whose stripped
      text ends with "部";
    * ``("text", value)`` for everything else except <br>, which is dropped
      so that the text reads continuously.

    The text of a <small> element is wrapped in parentheses unless it already
    starts with "〈" or "(".
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        document = bs4.BeautifulSoup(handle.read(), "html.parser")

    def to_event(node):
        # Map a single child node to its event, or None when it is skipped.
        tag = node.name
        if tag == "br":
            return None
        if tag == "span" and node.get("id") and node.text.strip().endswith("部"):
            return ("section", node.text.strip())
        if tag == "small":
            note = node.get_text()
            if note.startswith(("〈", "(")):
                return ("text", note)
            return ("text", f"({note})")
        if isinstance(node, str):
            return ("text", node)
        return ("text", node.get_text())

    events = []
    for div in document.find_all("div", class_="poem"):
        for paragraph in div.find_all("p"):
            for node in paragraph.children:
                event = to_event(node)
                if event is not None:
                    events.append(event)
    return events
|
||||
|
||||
|
||||
def extract_sections(content_text):
    """Split one entry's text into its 叙事 / 事对 / 诗文 parts.

    The text up to the first "事對" (or, failing that, "事对") is the
    narrative. What follows the two-character marker is the 事对 part, which
    ends just after the first "〉" that is immediately followed by a genre
    name; the remainder is 诗文. Without a marker the same genre split is
    applied to the whole text, dividing it into 叙事 and 诗文.

    In the returned values "〈"/"〉" are replaced by "("/")" and surrounding
    whitespace is stripped.
    """
    genre_pattern = r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛|讃|啟|辭|辞|操|對|对)"

    def split_at_genre(segment):
        # Cut right after the "〉" that precedes the first genre name.
        hit = re.search(genre_pattern, segment)
        if hit is None:
            return segment, ""
        cut = hit.start() + 1
        return segment[:cut], segment[cut:]

    def tidy(piece):
        return piece.replace("〈", "(").replace("〉", ")").strip()

    marker_at = content_text.find("事對")
    if marker_at < 0:
        marker_at = content_text.find("事对")

    if marker_at >= 0:
        narrative = content_text[:marker_at]
        pairs, literature = split_at_genre(content_text[marker_at + 2:])
    else:
        pairs = ""
        narrative, literature = split_at_genre(content_text)

    return {
        "叙事": tidy(narrative),
        "事对": tidy(pairs),
        "诗文": tidy(literature),
    }
|
||||
|
||||
# An entry heading: a boundary (start of text, a closing bracket or
# whitespace), the entry name, an optional ordinal such as 第七 or 第一上, and
# the 〈叙事〉/〈敘事〉 marker. Compiled once because it is applied to every
# section of every volume.
ENTRY_HEADING_RE = re.compile(
    r"(?:^|[〉)\u3000\s])([^〈〉<>()\s]+?)(?:第[一二三四五六七八九十百]+(?:[上下])?)?〈(?:叙事|敘事)〉"
)
TOTAL_VOLUMES = 30


def main():
    """Build 初学记.json from the extracted EPUB volumes in ``html_dir``.

    For each volume the parsed events are merged into (section, text) runs,
    each run is cut into entries at the entry headings, and each entry is
    split by ``extract_sections``. Entries are grouped by name in the output;
    volumes whose file cannot be found are reported and skipped.
    """
    categories = {}
    total_entries_found = 0

    # List the directory once rather than once per volume. Sorting makes the
    # pick deterministic should several files share a volume suffix.
    available_files = sorted(os.listdir(html_dir))

    for vol in range(1, TOTAL_VOLUMES + 1):
        suffix = f"juan{vol:02d}.xhtml"
        filename = next((fn for fn in available_files if fn.endswith(suffix)), None)

        if not filename:
            print(f"Volume {vol} not found")
            continue

        events = parse_html(os.path.join(html_dir, filename))

        # Merge consecutive text events into one string per section.
        merged = []
        current_section = ""
        current_text = []
        for ev_type, val in events:
            if ev_type == "section":
                if current_text:
                    merged.append((current_section, "".join(current_text)))
                    current_text = []
                current_section = val
            else:
                current_text.append(val)
        if current_text:
            merged.append((current_section, "".join(current_text)))

        for sec, txt in merged:
            matches = list(ENTRY_HEADING_RE.finditer(txt))

            for i, m in enumerate(matches):
                entry_name = m.group(1).strip()
                start_idx = m.end()  # content starts right after 〈叙事〉
                # End at the next entry's *name* (group 1), not at the start
                # of the whole match: the boundary character matched before
                # the name may be the "〉" that closes this entry's last
                # quotation (e.g. `...終負昔賢心〉虹蜺第七〈敘事〉`), so it must
                # stay with the current entry.
                if i + 1 < len(matches):
                    end_idx = matches[i + 1].start(1)
                else:
                    end_idx = len(txt)

                sections = extract_sections(txt[start_idx:end_idx])

                categories.setdefault(entry_name, []).append({
                    "volume": vol,
                    "section": sec,
                    "content": sections
                })
                total_entries_found += 1

    final_json = {
        "metadata": {
            "title": "初学记",
            "author": "徐坚",
            "dynasty": "唐",
            "total_volumes": TOTAL_VOLUMES,
            "source": "2026年1月28日从维基文库导出"
        },
        "preface": "",
        "categories": categories
    }

    with open("初学记.json", "w", encoding="utf-8") as f:
        json.dump(final_json, f, ensure_ascii=False, indent=2)

    print(f"Generated 初学记.json with {len(categories)} unique categories and {total_entries_found} total entries.")


if __name__ == "__main__":
    main()
|
||||
@@ -1,24 +0,0 @@
|
||||
import bs4
|
||||
import re
|
||||
import sys
|
||||
|
||||
def parse_html(filepath):
    """Print every section heading found in one volume and return them.

    A section heading is a <span> that has an id, is a direct child of a <p>
    inside a <div class="poem">, and whose stripped text ends with "部".
    All other child nodes are ignored.

    Returns:
        The heading labels in document order.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        soup = bs4.BeautifulSoup(f.read(), "html.parser")

    sections = []
    for div in soup.find_all("div", class_="poem"):
        for p in div.find_all("p"):
            for child in p.children:
                # Text nodes and <br> carry no heading, so only id-bearing
                # spans are inspected.
                if child.name != "span" or not child.get("id"):
                    continue
                label = child.text.strip()
                if label.endswith("部"):
                    sections.append(label)
                    print("Found Section:", label)
    return sections
|
||||
|
||||
parse_html("epub_extracted/OPS/c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml")
|
||||
@@ -1,54 +0,0 @@
|
||||
import bs4
|
||||
import re
|
||||
import os
|
||||
|
||||
html_dir = "epub_extracted/OPS"
|
||||
|
||||
|
||||
def parse_html(filepath):
    """Turn one volume's HTML into ``("line", text)`` / ``("section", label)`` events.

    Within each <p> of a <div class="poem">, child text is accumulated until
    a <br> (or the end of the paragraph) and then emitted, stripped, as one
    "line" event; empty accumulations emit nothing. A <span> that has an id
    and whose stripped text ends with "部" is emitted immediately as a
    "section" event. Text from <small> is wrapped in parentheses unless it
    already starts with "〈" or "(".
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        document = bs4.BeautifulSoup(handle.read(), "html.parser")

    events = []

    def emit(buffer):
        # Flush the pending fragments as one line event, if there are any.
        if buffer:
            events.append(("line", "".join(buffer).strip()))
            buffer.clear()

    for div in document.find_all("div", class_="poem"):
        for paragraph in div.find_all("p"):
            pending = []
            for node in paragraph.children:
                tag = node.name
                if tag == "br":
                    emit(pending)
                    continue
                if tag == "span" and node.get("id") and node.text.strip().endswith("部"):
                    events.append(("section", node.text.strip()))
                    continue
                piece = node.get_text()
                if tag == "small" and not piece.startswith(("〈", "(")):
                    piece = f"({piece})"
                pending.append(piece)
            emit(pending)
    return events
|
||||
|
||||
|
||||
events = parse_html(
|
||||
os.path.join(html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml")
|
||||
)
|
||||
for e in events[:30]:
|
||||
print(e)
|
||||
@@ -1,127 +0,0 @@
|
||||
import bs4
|
||||
import os
|
||||
import re
|
||||
|
||||
html_dir = "epub_extracted/OPS"
|
||||
|
||||
|
||||
def parse_html(filepath):
    """Turn one volume's HTML into ``("line", text)`` / ``("section", label)`` events.

    Children of each <p> inside a <div class="poem"> are gathered into lines
    that end at a <br> or at the end of the paragraph; each non-empty line is
    emitted stripped. A <span> with an id whose stripped text ends with "部"
    yields a "section" event at the point where it occurs. Text from <small>
    is wrapped in 〈〉 unless it already starts with "〈" or "(".
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        document = bs4.BeautifulSoup(handle.read(), "html.parser")

    events = []

    def emit(buffer):
        # Flush the pending fragments as one line event, if there are any.
        if buffer:
            events.append(("line", "".join(buffer).strip()))
            buffer.clear()

    paragraphs = [
        paragraph
        for div in document.find_all("div", class_="poem")
        for paragraph in div.find_all("p")
    ]
    for paragraph in paragraphs:
        pending = []
        for node in paragraph.children:
            tag = node.name
            if tag == "br":
                emit(pending)
            elif tag == "span" and node.get("id") and node.text.strip().endswith("部"):
                events.append(("section", node.text.strip()))
            else:
                piece = node.get_text()
                if tag == "small" and not piece.startswith(("〈", "(")):
                    piece = f"〈{piece}〉"
                pending.append(piece)
        emit(pending)
    return events
|
||||
|
||||
|
||||
def extract_sections(entry_text):
    """Split a full entry (heading included) into 叙事 / 事对 / 诗文.

    Everything after the first 〈叙事〉 or 〈敘事〉 marker is the entry body.
    The body up to the first "事對" (or, failing that, "事对") is the
    narrative; the text after that marker is the 事对 part, which ends just
    after the first "〉" that is immediately followed by a genre name; the
    remainder is 诗文. Without a 事對 marker the genre split is applied to
    the body itself.

    Returns:
        A dict with the keys "叙事", "事对" and "诗文". All three are empty
        when the text has no 叙事 marker. "〈"/"〉" are replaced by "("/")".
    """
    result = {"叙事": "", "事对": "", "诗文": ""}

    # DOTALL lets "." cross line breaks; without it everything after the
    # first newline in the entry would be silently discarded.
    narrative_match = re.search(r"〈(叙事|敘事)〉(.*)", entry_text, re.DOTALL)
    if not narrative_match:
        return result
    body = narrative_match.group(2)

    genre_pattern = r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛|讃|啟)"

    pair_start = body.find("事對")
    if pair_start == -1:
        pair_start = body.find("事对")

    if pair_start != -1:
        result["叙事"] = body[:pair_start]
        # Skip the two-character marker itself.
        tail_key, tail = "事对", body[pair_start + 2:]
    else:
        tail_key, tail = "叙事", body

    genre_match = re.search(genre_pattern, tail)
    if genre_match:
        # +1 keeps the closing "〉" with the part before the genre name.
        split_idx = genre_match.start() + 1
        result[tail_key] = tail[:split_idx]
        result["诗文"] = tail[split_idx:]
    else:
        result[tail_key] = tail

    for key in result:
        result[key] = result[key].replace("〈", "(").replace("〉", ")")

    return result
|
||||
|
||||
|
||||
# Entry heading at the start of an accumulated entry: the name, an optional
# ordinal such as 第七 or 第一上, then the 〈叙事〉/〈敘事〉 marker.
ENTRY_HEAD_RE = re.compile(
    r"^(.*?)(?:第[一二三四五六七八九十百]+(?:[上下])?)?〈(?:叙事|敘事)〉"
)


def collect_entries(events):
    """Group line events into ``(entry_name, sections)`` pairs.

    A new entry begins at every line containing 〈叙事〉 or 〈敘事〉; the
    lines that follow are appended to it until the next such line. Section
    events are ignored. Accumulated text that does not match the heading
    pattern is dropped.
    """
    entries = []
    buffer = []

    def flush():
        # Single place that turns the accumulated lines into an entry; the
        # same logic is needed at every entry boundary and at the very end.
        if not buffer:
            return
        full_text = "".join(buffer)
        match = ENTRY_HEAD_RE.search(full_text)
        if match:
            entries.append((match.group(1).strip(), extract_sections(full_text)))

    for ev_type, text in events:
        if ev_type == "section":
            continue
        if "〈叙事〉" in text or "〈敘事〉" in text:
            flush()
            buffer.clear()
        buffer.append(text)

    flush()
    return entries


def main():
    """Parse volume 1 and print a preview of its first three entries."""
    events = parse_html(
        os.path.join(html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml")
    )

    for entry_name, sections in collect_entries(events)[:3]:
        print("Entry:", entry_name)
        print("叙事:", sections["叙事"][:50])
        print("事对:", sections["事对"][:50])
        print("诗文:", sections["诗文"][:50])
        print("-" * 20)


if __name__ == "__main__":
    main()
|
||||
@@ -1,35 +0,0 @@
|
||||
import bs4
|
||||
import os
|
||||
import json
|
||||
import re
|
||||
|
||||
html_dir = "epub_extracted/OPS"
|
||||
|
||||
|
||||
def parse_html(filepath):
    """Return the text lines of one volume, split at <br> elements.

    For each <p> inside a <div class="poem"> the text of the child nodes is
    concatenated; every <br> ends the current line (which is emitted even
    when it is empty) and any non-empty remainder at the end of the paragraph
    forms a final line. Lines are stripped of surrounding whitespace.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        markup = handle.read()
    document = bs4.BeautifulSoup(markup, "html.parser")

    lines = []
    for div in document.find_all("div", class_="poem"):
        for paragraph in div.find_all("p"):
            fragments = []
            for node in paragraph.children:
                if node.name != "br":
                    fragments.append(node.get_text())
                    continue
                lines.append("".join(fragments).strip())
                fragments = []
            if fragments:
                lines.append("".join(fragments).strip())
    return lines
|
||||
|
||||
|
||||
texts = parse_html(
|
||||
os.path.join(html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml")
|
||||
)
|
||||
for i, t in enumerate(texts[:50]):
|
||||
print(f"{i}: {t}")
|
||||
@@ -1,93 +0,0 @@
|
||||
import bs4
|
||||
import os
|
||||
import json
|
||||
import re
|
||||
|
||||
html_dir = "epub_extracted/OPS"
|
||||
|
||||
|
||||
def parse_html(filepath):
    """Read one volume and return its <br>-delimited lines of text.

    Only <p> elements inside <div class="poem"> are read. A <br> closes the
    line being built, emitting it even if nothing was collected; text left
    over when a paragraph ends is emitted as a last line. Each line is
    stripped before it is stored.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        document = bs4.BeautifulSoup(handle.read(), "html.parser")

    def finish(parts):
        return "".join(parts).strip()

    collected = []
    paragraphs = (
        paragraph
        for div in document.find_all("div", class_="poem")
        for paragraph in div.find_all("p")
    )
    for paragraph in paragraphs:
        parts = []
        for node in paragraph.children:
            if node.name == "br":
                collected.append(finish(parts))
                parts = []
            else:
                parts.append(node.get_text())
        if parts:
            collected.append(finish(parts))
    return collected
|
||||
|
||||
|
||||
def extract_sections(entry_text):
    """Split a full entry line into its 叙事 / 事对 / 诗文 parts.

    Everything after the first 〈叙事〉 marker is the entry body. The body up
    to the first "事對" (or, failing that, "事对") is the narrative; the text
    after that marker is the 事对 part, which ends just after the first "〉"
    that is immediately followed by a literary genre name; the remainder is
    诗文. Without a 事對 marker the genre split is applied to the body
    itself.

    Returns:
        A dict with the keys "叙事", "事对" and "诗文"; all three are empty
        when the text has no 〈叙事〉 marker.
    """
    result = {"叙事": "", "事对": "", "诗文": ""}

    # DOTALL lets "." cross line breaks; without it everything after the
    # first newline in the entry would be silently discarded.
    narrative_match = re.search(r"〈叙事〉(.*)", entry_text, re.DOTALL)
    if not narrative_match:
        return result
    body = narrative_match.group(1)

    # "〉" followed by a literary genre marks the start of the 诗文 part.
    genre_pattern = r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛)"

    pair_start = body.find("事對")
    if pair_start == -1:
        pair_start = body.find("事对")

    if pair_start != -1:
        result["叙事"] = body[:pair_start]
        # Skip the two-character marker itself.
        tail_key, tail = "事对", body[pair_start + 2:]
    else:
        tail_key, tail = "叙事", body

    genre_match = re.search(genre_pattern, tail)
    if genre_match:
        # +1 keeps the closing "〉" with the part before the genre name.
        split_idx = genre_match.start() + 1
        result[tail_key] = tail[:split_idx]
        result["诗文"] = tail[split_idx:]
    else:
        result[tail_key] = tail

    return result
|
||||
|
||||
|
||||
# Preview script: parse volume 1, treat every line that contains 〈叙事〉 as
# one entry, and print the section lengths of the first three entries.
texts = parse_html(
    os.path.join(html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml")
)

entries = []
for line in texts:
    heading, marker, _ = line.partition("〈叙事〉")
    if not marker:
        # Line carries no entry heading.
        continue
    # Drop a trailing ordinal such as 第七 or 第一上 to get the bare word.
    word_name = re.sub(
        r"第[一二三四五六七八九十百]+(?:[上下])?$", "", heading
    ).strip()
    entries.append((word_name, heading, extract_sections(line)))

for word_name, entry_name, sections in entries[:3]:
    print(f"Word: {word_name}")
    print(f"Entry: {entry_name}")
    print(f"叙事 len: {len(sections['叙事'])}")
    print(f"事对 len: {len(sections['事对'])}")
    print(f"诗文 len: {len(sections['诗文'])}")
    print("-" * 20)
|
||||
@@ -1,41 +0,0 @@
|
||||
import re
|
||||
|
||||
def extract_sections(content_text):
    """Divide an entry body into 叙事, 事对 and 诗文.

    The narrative runs up to the first "事對" marker ("事对" is tried only
    when "事對" is absent). The text after the marker is 事对 up to and
    including the first "〉" that is directly followed by a genre name, and
    诗文 from the genre name onward. With no marker, the narrative itself is
    split at that same genre boundary.

    "〈" and "〉" in the returned values are replaced by "(" and ")";
    whitespace is left untouched.
    """
    genre_pattern = r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛|讃|啟|辭|辞|操|對|对|銘|論)"
    brackets = str.maketrans({"〈": "(", "〉": ")"})

    marker_at = content_text.find("事對")
    if marker_at == -1:
        marker_at = content_text.find("事对")
    has_pairs = marker_at != -1

    # `remainder` is the stretch that may still contain the 诗文 boundary.
    remainder = content_text[marker_at + 2:] if has_pairs else content_text
    boundary = re.search(genre_pattern, remainder)
    if boundary:
        cut = boundary.start() + 1
        before, literature = remainder[:cut], remainder[cut:]
    else:
        before, literature = remainder, ""

    if has_pairs:
        narrative, pairs = content_text[:marker_at], before
    else:
        narrative, pairs = before, ""

    return {
        "叙事": narrative.translate(brackets),
        "事对": pairs.translate(brackets),
        "诗文": literature.translate(brackets),
    }
|
||||
|
||||
import json
|
||||
print(extract_sections("這里是叙事事對這裡是事對〉詩這里是詩文"))
|
||||
Reference in New Issue
Block a user