Update: 删除垃圾程序
This commit is contained in:
@@ -1,161 +0,0 @@
|
||||
import bs4
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
|
||||
html_dir = "epub_extracted/OPS"
|
||||
|
||||
def parse_html(filepath):
    """Parse one exported XHTML volume into a flat event stream.

    Returns a list of ("section", name) / ("text", fragment) tuples in
    document order, collected from every <p> inside <div class="poem">.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        soup = bs4.BeautifulSoup(f.read(), "html.parser")

    poem_divs = soup.find_all("div", class_="poem")
    events = []

    for div in poem_divs:
        for p in div.find_all("p"):
            for child in p.children:
                if child.name == "br":
                    pass  # ignore br, we want continuous text
                elif child.name == "span" and child.get("id") and child.text.strip().endswith("部"):
                    # Anchored <span id=...> ending in 部 marks a section
                    # (category) heading.
                    events.append(("section", child.text.strip()))
                elif child.name == "small":
                    # <small> holds inline commentary; wrap it in fullwidth
                    # parentheses unless it already carries its own brackets.
                    small_text = child.get_text()
                    if not small_text.startswith("〈") and not small_text.startswith("("):
                        events.append(("text", f"({small_text})"))
                    else:
                        events.append(("text", small_text))
                elif isinstance(child, str):
                    # Plain text node (NavigableString is a str subclass;
                    # its .name is None so the tag checks above fall through).
                    events.append(("text", child))
                else:
                    events.append(("text", child.get_text()))

    return events
|
||||
|
||||
|
||||
def extract_sections(content_text):
    """Split an entry body (text after its 〈叙事〉 heading) into sections.

    The text is divided into:
      叙事 (narrative)  -- everything before the 事對/事对 marker,
      事对 (parallels)  -- after that marker, up to the first genre tag,
      诗文 (poetry)     -- from the first `〉<genre>` occurrence onward.
    A missing marker leaves the corresponding key as an empty string.
    Fullwidth angle brackets 〈〉 are normalized to parentheses and each
    section is stripped of surrounding whitespace.
    """
    result = {"叙事": "", "事对": "", "诗文": ""}

    # A closing 〉 followed by a literary genre character marks the start
    # of the 诗文 (poetry) section.
    genre_pattern = r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛|讃|啟|辭|辞|操|對|对)"

    # The 事對 marker may appear in traditional or simplified form.
    shidui_start = content_text.find("事對")
    if shidui_start == -1:
        shidui_start = content_text.find("事对")

    if shidui_start != -1:
        result["叙事"] = content_text[:shidui_start]
        rest_after_shidui = content_text[shidui_start + 2 :]  # skip the marker

        shiwen_match = re.search(genre_pattern, rest_after_shidui)
        if shiwen_match:
            # +1 keeps the closing 〉 with the 事对 section.
            split_idx = shiwen_match.start() + 1
            result["事对"] = rest_after_shidui[:split_idx]
            result["诗文"] = rest_after_shidui[split_idx:]
        else:
            result["事对"] = rest_after_shidui
    else:
        # No parallels section: split narrative directly from poetry.
        shiwen_match = re.search(genre_pattern, content_text)
        if shiwen_match:
            split_idx = shiwen_match.start() + 1
            result["叙事"] = content_text[:split_idx]
            result["诗文"] = content_text[split_idx:]
        else:
            result["叙事"] = content_text

    # Normalize fullwidth angle brackets and trim whitespace.
    for key in result:
        result[key] = result[key].replace("〈", "(").replace("〉", ")").strip()

    return result
|
||||
|
||||
def main():
    """Build 初学记.json from the extracted EPUB volumes.

    For each of the 30 volumes: locate its xhtml file, stream it into
    section/text events, merge the text per section heading, split each
    merged text into named entries at every 〈叙事〉/〈敘事〉 heading, and
    break each entry into 叙事/事对/诗文 parts via extract_sections().
    """
    categories = {}
    total_entries_found = 0

    for vol in range(1, 31):
        # Volume files end in ...juanNN.xhtml; find the one for this vol.
        filename = None
        for fn in os.listdir(html_dir):
            if fn.endswith(f"juan{vol:02d}.xhtml"):
                filename = fn
                break

        if not filename:
            print(f"Volume {vol} not found")
            continue

        events = parse_html(os.path.join(html_dir, filename))

        # Merge consecutive text events under their current section heading.
        merged = []
        current_section = ""
        current_text = []
        for ev_type, val in events:
            if ev_type == "section":
                if current_text:
                    merged.append((current_section, "".join(current_text)))
                    current_text = []
                current_section = val
            else:
                current_text.append(val)
        if current_text:
            merged.append((current_section, "".join(current_text)))

        for sec, txt in merged:
            # An entry heading is `名稱第七〈叙事〉` (ordinal optional), possibly
            # preceded by a boundary char such as 〉, ), ideographic space, or
            # whitespace; group 1 captures the bare entry name.
            matches = list(re.finditer(r"(?:^|[〉)\u3000\s])([^〈〉<>()\s]+?)(?:第[一二三四五六七八九十百]+(?:[上下])?)?〈(?:叙事|敘事)〉", txt))
            if not matches:
                continue

            for i, m in enumerate(matches):
                entry_name = m.group(1).strip()
                start_idx = m.end()  # content begins right after 〈叙事〉
                # End the slice at the NEXT heading's name group,
                # matches[i+1].start(1), NOT matches[i+1].start(): the optional
                # boundary char (e.g. the closing 〉 of the previous quotation)
                # belongs to THIS entry's poetry, so it must stay in the slice.
                end_idx = matches[i + 1].start(1) if i + 1 < len(matches) else len(txt)

                content_text = txt[start_idx:end_idx]
                sections = extract_sections(content_text)

                categories.setdefault(entry_name, []).append({
                    "volume": vol,
                    "section": sec,
                    "content": sections,
                })
                total_entries_found += 1

    final_json = {
        "metadata": {
            "title": "初学记",
            "author": "徐坚",
            "dynasty": "唐",
            "total_volumes": 30,
            "source": "2026年1月28日从维基文库导出",
        },
        "preface": "",
        "categories": categories,
    }

    with open("初学记.json", "w", encoding="utf-8") as f:
        json.dump(final_json, f, ensure_ascii=False, indent=2)

    print(f"Generated 初学记.json with {len(categories)} unique categories and {total_entries_found} total entries.")
|
||||
|
||||
# Script entry point: run the pipeline only when executed directly.
if __name__ == "__main__":
    main()
|
||||
@@ -1,24 +0,0 @@
|
||||
import bs4
|
||||
import re
|
||||
import sys
|
||||
|
||||
def parse_html(filepath):
    """Debug helper: print every section heading found in one volume.

    A heading is an anchored <span id=...> whose text ends in 部, inside
    the <p> elements of <div class="poem"> blocks.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        soup = bs4.BeautifulSoup(f.read(), "html.parser")

    current_section = None

    poem_divs = soup.find_all("div", class_="poem")
    for div in poem_divs:
        for p in div.find_all("p"):
            for child in p.children:
                if child.name == "span" and child.get("id") and child.text.strip().endswith("部"):
                    current_section = child.text.strip()
                    print("Found Section:", current_section)
                elif child.name == "br":
                    pass
                elif isinstance(child, bs4.element.NavigableString):
                    # isinstance instead of `type(...) ==` so str-like
                    # subclasses are treated uniformly.
                    pass


parse_html("epub_extracted/OPS/c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml")
|
||||
@@ -1,54 +0,0 @@
|
||||
import bs4
|
||||
import re
|
||||
import os
|
||||
|
||||
html_dir = "epub_extracted/OPS"
|
||||
|
||||
|
||||
def parse_html(filepath):
    """Parse one volume into ("section", name) / ("line", text) events.

    Text between <br/> tags is accumulated into a single line event;
    <small> commentary is wrapped in fullwidth parentheses when it does
    not already carry brackets of its own.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        soup = bs4.BeautifulSoup(f.read(), "html.parser")

    poem_divs = soup.find_all("div", class_="poem")

    events = []

    for div in poem_divs:
        for p in div.find_all("p"):
            current_line = []
            for child in p.children:
                if child.name == "br":
                    # <br/> terminates the current accumulated line.
                    if current_line:
                        events.append(("line", "".join(current_line).strip()))
                        current_line = []
                elif (
                    child.name == "span"
                    and child.get("id")
                    and child.text.strip().endswith("部")
                ):
                    # Anchored spans ending in 部 are section headings.
                    events.append(("section", child.text.strip()))
                elif child.name == "small":
                    # Convert small text to brackets
                    small_text = child.get_text()
                    # It might already have brackets inside.
                    if not small_text.startswith("〈") and not small_text.startswith(
                        "("
                    ):
                        current_line.append(f"({small_text})")
                    else:
                        # Already bracketed (the source sometimes hides the
                        # opening bracket in an invisible styled span), so
                        # keep the text as-is.
                        current_line.append(small_text)
                else:
                    current_line.append(child.get_text())
            if current_line:
                events.append(("line", "".join(current_line).strip()))
    return events


# Exploratory run: dump the first 30 events of volume 1.
events = parse_html(
    os.path.join(html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml")
)
for e in events[:30]:
    print(e)
|
||||
@@ -1,127 +0,0 @@
|
||||
import bs4
|
||||
import os
|
||||
import re
|
||||
|
||||
html_dir = "epub_extracted/OPS"
|
||||
|
||||
|
||||
def parse_html(filepath):
    """Parse one volume into ("section", name) / ("line", text) events.

    Like the earlier revision, but <small> commentary is wrapped in
    fullwidth angle brackets 〈〉 instead of parentheses.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        soup = bs4.BeautifulSoup(f.read(), "html.parser")

    poem_divs = soup.find_all("div", class_="poem")

    events = []
    for div in poem_divs:
        for p in div.find_all("p"):
            current_line = []
            for child in p.children:
                if child.name == "br":
                    # <br/> terminates the current accumulated line.
                    if current_line:
                        events.append(("line", "".join(current_line).strip()))
                        current_line = []
                elif (
                    child.name == "span"
                    and child.get("id")
                    and child.text.strip().endswith("部")
                ):
                    # Anchored spans ending in 部 are section headings.
                    events.append(("section", child.text.strip()))
                elif child.name == "small":
                    small_text = child.get_text()
                    # Wrap commentary in 〈〉 unless it is already bracketed.
                    if not small_text.startswith("〈") and not small_text.startswith(
                        "("
                    ):
                        current_line.append(f"〈{small_text}〉")
                    else:
                        current_line.append(small_text)
                else:
                    current_line.append(child.get_text())
            if current_line:
                events.append(("line", "".join(current_line).strip()))
    return events
|
||||
|
||||
|
||||
def extract_sections(entry_text):
    """Split a full entry (heading included) into 叙事/事对/诗文 parts.

    The entry must contain an 〈叙事〉 (or 〈敘事〉) marker; otherwise all
    three sections come back empty. Fullwidth angle brackets in the
    output are normalized to parentheses.
    """
    result = {"叙事": "", "事对": "", "诗文": ""}

    narrative_match = re.search(r"〈(叙事|敘事)〉(.*)", entry_text)
    if not narrative_match:
        return result

    rest_after_narrative = narrative_match.group(2)

    # The 事對 marker may be traditional or simplified.
    shidui_start = rest_after_narrative.find("事對")
    if shidui_start == -1:
        shidui_start = rest_after_narrative.find("事对")

    # A closing 〉 followed by a genre character marks the poetry section.
    # Hoisted here so both branches share one definition.
    genre_pattern = r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛|讃|啟)"

    if shidui_start != -1:
        result["叙事"] = rest_after_narrative[:shidui_start]
        rest_after_shidui = rest_after_narrative[shidui_start + 2 :]  # skip marker

        shiwen_match = re.search(genre_pattern, rest_after_shidui)
        if shiwen_match:
            split_idx = shiwen_match.start() + 1  # keep 〉 with 事对
            result["事对"] = rest_after_shidui[:split_idx]
            result["诗文"] = rest_after_shidui[split_idx:]
        else:
            result["事对"] = rest_after_shidui
    else:
        # No parallels section: split narrative directly from poetry.
        shiwen_match = re.search(genre_pattern, rest_after_narrative)
        if shiwen_match:
            split_idx = shiwen_match.start() + 1
            result["叙事"] = rest_after_narrative[:split_idx]
            result["诗文"] = rest_after_narrative[split_idx:]
        else:
            result["叙事"] = rest_after_narrative

    for k in result:
        result[k] = result[k].replace("〈", "(").replace("〉", ")")

    return result
|
||||
|
||||
|
||||
# Exploratory driver: split volume 1 into entries at each 〈叙事〉 heading
# and show the first three parsed entries.
events = parse_html(
    os.path.join(html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml")
)

entries = []
current_entry_text = []

for ev_type, text in events:
    if ev_type == "section":
        # Section headings are ignored in this experiment.
        pass
    else:
        if "〈叙事〉" in text or "〈敘事〉" in text:
            # A new entry starts here; flush the one accumulated so far.
            if current_entry_text:
                full_text = "".join(current_entry_text)
                # Entry name precedes an optional ordinal (第X, 上/下) and
                # the 〈叙事〉 marker.
                match = re.search(
                    r"^(.*?)(?:第[一二三四五六七八九十百]+(?:[上下])?)?〈(?:叙事|敘事)〉",
                    full_text,
                )
                if match:
                    entry_name = match.group(1).strip()
                    sections = extract_sections(full_text)
                    entries.append((entry_name, sections))
            current_entry_text = [text]
        else:
            current_entry_text.append(text)

# Flush the final entry after the loop ends.
if current_entry_text:
    full_text = "".join(current_entry_text)
    match = re.search(
        r"^(.*?)(?:第[一二三四五六七八九十百]+(?:[上下])?)?〈(?:叙事|敘事)〉", full_text
    )
    if match:
        entry_name = match.group(1).strip()
        sections = extract_sections(full_text)
        entries.append((entry_name, sections))

# Preview the first three entries (first 50 chars of each section).
for entry_name, sections in entries[:3]:
    print("Entry:", entry_name)
    print("叙事:", sections["叙事"][:50])
    print("事对:", sections["事对"][:50])
    print("诗文:", sections["诗文"][:50])
    print("-" * 20)
|
||||
@@ -1,35 +0,0 @@
|
||||
import bs4
|
||||
import os
|
||||
import json
|
||||
import re
|
||||
|
||||
html_dir = "epub_extracted/OPS"
|
||||
|
||||
|
||||
def parse_html(filepath):
    """Return the text of each <br/>-separated line inside the
    <div class="poem"> paragraphs of one volume."""
    with open(filepath, "r", encoding="utf-8") as f:
        soup = bs4.BeautifulSoup(f.read(), "html.parser")

    poem_divs = soup.find_all("div", class_="poem")
    texts = []
    for div in poem_divs:
        for p in div.find_all("p"):
            # We want to process the text inside <p> while respecting <br/> as separators.
            # But actually, inside a <p>, there are text nodes, <span>, <small>, <br/>, etc.
            current_line = []
            for child in p.children:
                if child.name == "br":
                    # NOTE(review): unlike later revisions, this appends even
                    # when current_line is empty, so blank strings can appear
                    # in the output.
                    texts.append("".join(current_line).strip())
                    current_line = []
                else:
                    current_line.append(child.get_text())
            if current_line:
                texts.append("".join(current_line).strip())
    return texts


# Exploratory run: print the first 50 lines of volume 1.
texts = parse_html(
    os.path.join(html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml")
)
for i, t in enumerate(texts[:50]):
    print(f"{i}: {t}")
|
||||
@@ -1,93 +0,0 @@
|
||||
import bs4
|
||||
import os
|
||||
import json
|
||||
import re
|
||||
|
||||
html_dir = "epub_extracted/OPS"
|
||||
|
||||
|
||||
def parse_html(filepath):
    """Return the <br/>-separated text lines from every <p> inside
    <div class="poem"> of one volume."""
    with open(filepath, "r", encoding="utf-8") as f:
        soup = bs4.BeautifulSoup(f.read(), "html.parser")

    poem_divs = soup.find_all("div", class_="poem")
    texts = []
    for div in poem_divs:
        for p in div.find_all("p"):
            current_line = []
            for child in p.children:
                if child.name == "br":
                    # <br/> ends the current line (may append an empty string
                    # when the line has no content yet).
                    texts.append("".join(current_line).strip())
                    current_line = []
                else:
                    current_line.append(child.get_text())
            if current_line:
                texts.append("".join(current_line).strip())
    return texts
|
||||
|
||||
|
||||
def extract_sections(entry_text):
    """Split a full entry into 叙事 / 事对 / 诗文 parts.

    Requires a simplified 〈叙事〉 marker; otherwise all three sections are
    returned empty. Brackets in the output are left untouched (this
    revision does no normalization).
    """
    result = {"叙事": "", "事对": "", "诗文": ""}

    # Extract 叙事: everything after the 〈叙事〉 marker.
    narrative_match = re.search(r"〈叙事〉(.*)", entry_text)
    if not narrative_match:
        return result

    rest_after_narrative = narrative_match.group(1)

    # Find the 事对 marker (traditional first, then simplified).
    shidui_start = rest_after_narrative.find("事對")
    if shidui_start == -1:
        shidui_start = rest_after_narrative.find("事对")

    # A closing 〉 followed by a literary genre character marks the 诗文
    # section. Hoisted so both branches share a single definition.
    genre_pattern = r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛)"

    if shidui_start != -1:
        result["叙事"] = rest_after_narrative[:shidui_start]
        rest_after_shidui = rest_after_narrative[shidui_start + 2 :]  # skip "事對"

        shiwen_match = re.search(genre_pattern, rest_after_shidui)
        if shiwen_match:
            split_idx = shiwen_match.start() + 1  # keep the genre character
            result["事对"] = rest_after_shidui[:split_idx]
            result["诗文"] = rest_after_shidui[split_idx:]
        else:
            result["事对"] = rest_after_shidui
    else:
        # No 事对 section: split narrative directly from poetry.
        shiwen_match = re.search(genre_pattern, rest_after_narrative)
        if shiwen_match:
            split_idx = shiwen_match.start() + 1
            result["叙事"] = rest_after_narrative[:split_idx]
            result["诗文"] = rest_after_narrative[split_idx:]
        else:
            result["叙事"] = rest_after_narrative

    return result
|
||||
|
||||
|
||||
# Exploratory driver: treat each line containing 〈叙事〉 as one entry.
texts = parse_html(
    os.path.join(html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml")
)
entries = []
for line in texts:
    if "〈叙事〉" in line:
        entry_name = line.split("〈叙事〉")[0]
        # remove "第X" from entry_name
        word_name = re.sub(
            r"第[一二三四五六七八九十百]+(?:[上下])?$", "", entry_name
        ).strip()
        sections = extract_sections(line)
        entries.append((word_name, entry_name, sections))

# Preview section lengths for the first three entries.
for e in entries[:3]:
    print(f"Word: {e[0]}")
    print(f"Entry: {e[1]}")
    print(f"叙事 len: {len(e[2]['叙事'])}")
    print(f"事对 len: {len(e[2]['事对'])}")
    print(f"诗文 len: {len(e[2]['诗文'])}")
    print("-" * 20)
|
||||
@@ -1,41 +0,0 @@
|
||||
import re
|
||||
|
||||
def extract_sections(content_text):
    """Split the body of an entry (text after its 〈叙事〉 heading) into
    叙事 / 事对 / 诗文 parts, normalizing 〈〉 to parentheses (no strip).
    """
    result = {"叙事": "", "事对": "", "诗文": ""}

    # A closing 〉 followed by a literary genre character marks the 诗文
    # section. (The original alternation listed 銘 and 論 twice; the
    # duplicates are removed — regex behavior is identical.)
    genre_pattern = r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛|讃|啟|辭|辞|操|對|对)"

    # The 事對 marker may be traditional or simplified.
    shidui_start = content_text.find("事對")
    if shidui_start == -1:
        shidui_start = content_text.find("事对")

    if shidui_start != -1:
        result["叙事"] = content_text[:shidui_start]
        rest_after_shidui = content_text[shidui_start + 2 :]  # skip the marker

        shiwen_match = re.search(genre_pattern, rest_after_shidui)
        if shiwen_match:
            split_idx = shiwen_match.start() + 1  # keep 〉 with 事对
            result["事对"] = rest_after_shidui[:split_idx]
            result["诗文"] = rest_after_shidui[split_idx:]
        else:
            result["事对"] = rest_after_shidui
    else:
        # No parallels section: split narrative directly from poetry.
        shiwen_match = re.search(genre_pattern, content_text)
        if shiwen_match:
            split_idx = shiwen_match.start() + 1
            result["叙事"] = content_text[:split_idx]
            result["诗文"] = content_text[split_idx:]
        else:
            result["叙事"] = content_text

    for k in result:
        result[k] = result[k].replace("〈", "(").replace("〉", ")")

    return result
|
||||
|
||||
# Quick smoke test of extract_sections on a synthetic entry.
# NOTE(review): json is imported here but never used in this script.
import json

print(extract_sections("這里是叙事事對這裡是事對〉詩這里是詩文"))
|
||||
Reference in New Issue
Block a user