94 lines
3.1 KiB
Python
94 lines
3.1 KiB
Python
import bs4
|
|
import os
|
|
import json
|
|
import re
|
|
|
|
# Relative directory containing the XHTML files read by this script.
html_dir = "epub_extracted/OPS"
|
|
|
|
|
|
def parse_html(filepath) -> list:
    """Return the text lines found inside ``<div class="poem">`` blocks.

    The file at *filepath* is read as UTF-8 and parsed with Beautiful Soup's
    ``html.parser``.  Within every ``<p>`` of every ``div.poem``, the direct
    children are concatenated into one line until a ``<br>`` is met; each
    ``<br>`` ends the current line.  Lines are whitespace-stripped before
    being collected.

    Args:
        filepath: Path of the (X)HTML file to read.

    Returns:
        A list of stripped line strings in document order.  A ``<br>`` with
        no text before it yields an empty string.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        soup = bs4.BeautifulSoup(f.read(), "html.parser")

    texts = []
    for div in soup.find_all("div", class_="poem"):
        for p in div.find_all("p"):
            current_line = []
            for child in p.children:
                if child.name == "br":
                    # A <br> terminates the line accumulated so far.
                    texts.append("".join(current_line).strip())
                    current_line = []
                else:
                    current_line.append(child.get_text())
            # Flush whatever follows the last <br> of this paragraph.
            if current_line:
                texts.append("".join(current_line).strip())
    return texts
|
|
|
|
|
|
# Marker that introduces the narrative section of an entry.
_NARRATIVE_MARKER = "〈叙事〉"

# Spellings of the parallel-phrases heading, tried in this order.
_PARALLELS_MARKERS = ("事對", "事对")

# A closing annotation bracket immediately followed by a literary genre name.
_GENRE_RE = re.compile(
    r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛)"
)


def _split_before_genre(text):
    """Split *text* right after the ``〉`` preceding the first genre name.

    Returns ``(head, tail)``; the genre character starts ``tail``.  When no
    genre heading is found, ``head`` is the whole text and ``tail`` is empty.
    """
    match = _GENRE_RE.search(text)
    if match is None:
        return text, ""
    cut = match.start() + 1  # keep "〉" in the head, the genre in the tail
    return text[:cut], text[cut:]


def extract_sections(entry_text: str) -> dict:
    """Split one entry into its 叙事 / 事对 / 诗文 sections.

    Everything after the first ``〈叙事〉`` is examined.  If a ``事對`` (or,
    failing that, ``事对``) heading is present, the text before it becomes
    ``叙事`` and the text after it is divided into ``事对`` and ``诗文`` at the
    first ``〉`` directly followed by a genre name.  Without such a heading the
    same genre split divides the text into ``叙事`` and ``诗文``.

    Args:
        entry_text: Full text of an entry; may span several lines.

    Returns:
        A dict with the keys ``"叙事"``, ``"事对"`` and ``"诗文"``.  Sections
        that are not found are empty strings; all three are empty when
        ``〈叙事〉`` does not occur.
    """
    result = {"叙事": "", "事对": "", "诗文": ""}

    # str.partition keeps text after a newline, which a regex "(.*)" without
    # DOTALL would silently drop.
    _, marker, rest = entry_text.partition(_NARRATIVE_MARKER)
    if not marker:
        return result

    heading = ""
    heading_pos = -1
    for heading in _PARALLELS_MARKERS:
        heading_pos = rest.find(heading)
        if heading_pos != -1:
            break

    if heading_pos != -1:
        result["叙事"] = rest[:heading_pos]
        after_heading = rest[heading_pos + len(heading):]  # skip the heading
        result["事对"], result["诗文"] = _split_before_genre(after_heading)
    else:
        # No 事对 heading: the narrative runs up to the first genre heading.
        result["叙事"], result["诗文"] = _split_before_genre(rest)

    return result
|
|
|
|
|
|
# Parse the first volume and collect every line that starts an entry.
texts = parse_html(
    os.path.join(html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml")
)

# Each element is (word_name, entry_name, sections).
entries = []
for line in texts:
    if "〈叙事〉" not in line:
        continue
    entry_name = line.split("〈叙事〉")[0]
    # Remove the trailing ordinal ("第X", optionally followed by 上/下).
    # Strip first: the pattern is anchored with "$", so whitespace between
    # the name and the marker would otherwise prevent the match.
    word_name = re.sub(
        r"第[一二三四五六七八九十百]+(?:[上下])?$", "", entry_name.strip()
    ).strip()
    sections = extract_sections(line)
    entries.append((word_name, entry_name, sections))

# Print a short summary of the first three entries as a sanity check.
for word, name, parts in entries[:3]:
    print(f"Word: {word}")
    print(f"Entry: {name}")
    print(f"叙事 len: {len(parts['叙事'])}")
    print(f"事对 len: {len(parts['事对'])}")
    print(f"诗文 len: {len(parts['诗文'])}")
    print("-" * 20)
|