Files
spider-ctext/初学记/generate_json.py
2026-03-22 16:18:35 +08:00

162 lines
6.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import bs4
import os
import re
import json
# Directory containing the XHTML volumes extracted from the EPUB (one file per juan).
html_dir = "epub_extracted/OPS"
def parse_html(filepath):
    """Parse one volume's XHTML file into a flat event stream.

    Returns a list of ``(event_type, value)`` tuples where ``event_type`` is:
      - ``"section"``: a section heading (a ``<span>`` with an id inside a poem ``<p>``)
      - ``"text"``:    a run of body text, to be concatenated in document order
    """
    with open(filepath, "r", encoding="utf-8") as f:
        soup = bs4.BeautifulSoup(f.read(), "html.parser")
    poem_divs = soup.find_all("div", class_="poem")
    events = []
    for div in poem_divs:
        for p in div.find_all("p"):
            for child in p.children:
                if child.name == "br":
                    pass  # ignore <br>; we want one continuous text run
                # NOTE(review): the empty-string literals in endswith("") /
                # startswith("") below appear to have LOST their marker
                # characters during export (the page warned about hidden
                # "ambiguous Unicode"). As written, endswith("") is always
                # True and `not startswith("")` is always False, so the
                # f-string branch is dead. Confirm against the original file.
                elif child.name == "span" and child.get("id") and child.text.strip().endswith(""):
                    events.append(("section", child.text.strip()))
                elif child.name == "small":
                    small_text = child.get_text()
                    if not small_text.startswith("") and not small_text.startswith(""):
                        events.append(("text", f"{small_text}"))
                    else:
                        events.append(("text", small_text))
                elif isinstance(child, str):
                    # Plain NavigableString between tags.
                    events.append(("text", child))
                else:
                    # Any other tag: flatten to its text content.
                    events.append(("text", child.get_text()))
    return events
def extract_sections(content_text):
    """Split one entry's raw text into its three canonical sections.

    Layout of an entry: a narrative part ("叙事"), then optionally a
    paired-allusions part introduced by the marker 事對/事对 ("事对"),
    then the poetry/prose part ("诗文") which begins at the first genre
    character that directly follows a closing bracket 〉. Sections that
    are absent come back as empty strings.
    """
    # Genre names that may open the 诗文 section right after a 〉 bracket.
    genre_pattern = r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛|讃|啟|辭|辞|操|對|对)"

    pieces = {"叙事": "", "事对": "", "诗文": ""}

    # Locate the paired-allusions marker: traditional form first, then simplified.
    marker_pos = content_text.find("事對")
    if marker_pos == -1:
        marker_pos = content_text.find("事对")

    if marker_pos == -1:
        # No 事对 marker: only narrative vs. poetry need separating.
        hit = re.search(genre_pattern, content_text)
        if hit is None:
            pieces["叙事"] = content_text
        else:
            cut = hit.start() + 1  # keep the 〉 with the preceding section
            pieces["叙事"] = content_text[:cut]
            pieces["诗文"] = content_text[cut:]
    else:
        pieces["叙事"] = content_text[:marker_pos]
        remainder = content_text[marker_pos + 2:]  # skip the two-char marker itself
        hit = re.search(genre_pattern, remainder)
        if hit is None:
            pieces["事对"] = remainder
        else:
            cut = hit.start() + 1
            pieces["事对"] = remainder[:cut]
            pieces["诗文"] = remainder[cut:]

    # NOTE(review): the empty-string replace() calls mirror the original
    # source; their literals seem to have lost characters in export and are
    # no-ops as written. Kept verbatim to preserve behavior.
    return {k: v.replace("", "").replace("", "").strip() for k, v in pieces.items()}
def main():
    """Walk volumes 1–30, extract per-entry sections, and write 初学记.json."""
    categories = {}
    total_entries_found = 0
    for vol in range(1, 31):
        # Locate this volume's file by its zero-padded "juanNN.xhtml" suffix.
        filename = None
        for fn in os.listdir(html_dir):
            if fn.endswith(f"juan{vol:02d}.xhtml"):
                filename = fn
                break
        if not filename:
            print(f"Volume {vol} not found")
            continue
        events = parse_html(os.path.join(html_dir, filename))
        # Merge consecutive "text" events under the most recent "section"
        # heading, producing (section, full_text) pairs.
        merged = []
        current_section = ""
        current_text = []
        for ev_type, val in events:
            if ev_type == "section":
                if current_text:
                    merged.append((current_section, "".join(current_text)))
                    current_text = []
                current_section = val
            else:
                current_text.append(val)
        if current_text:
            merged.append((current_section, "".join(current_text)))
        for sec, txt in merged:
            # Each entry header looks like: <name>(第N[上下])?〈叙事〉, where the
            # name follows a boundary character (start of text, 〉, ), ideographic
            # space, or whitespace). group(1) is the bare entry name.
            matches = list(re.finditer(r"(?:^|[〉)\u3000\s])([^〈〉<>\s]+?)(?:第[一二三四五六七八九十百]+(?:[上下])?)?〈(?:叙事|敘事)〉", txt))
            if not matches:
                continue
            for i, m in enumerate(matches):
                entry_name = m.group(1).strip()
                start_idx = m.end()  # content starts right after 〈叙事〉
                # End this entry at the NEXT entry's name — matches[i+1].start(1),
                # NOT matches[i+1].start(): the next match's optional one-char
                # prefix is usually the 〉 that closes the CURRENT entry's final
                # quoted poem (e.g. "…終負昔賢心〉虹蜺第七〈敘事〉"), so that
                # bracket must remain inside this entry's content slice.
                end_idx = matches[i+1].start(1) if i + 1 < len(matches) else len(txt)
                content_text = txt[start_idx:end_idx]
                sections = extract_sections(content_text)
                if entry_name not in categories:
                    categories[entry_name] = []
                categories[entry_name].append({
                    "volume": vol,
                    "section": sec,
                    "content": sections
                })
                total_entries_found += 1
    final_json = {
        "metadata": {
            "title": "初学记",
            "author": "徐坚",
            "dynasty": "",
            "total_volumes": 30,
            "source": "2026年1月28日从维基文库导出"
        },
        "preface": "",
        "categories": categories
    }
    with open("初学记.json", "w", encoding="utf-8") as f:
        json.dump(final_json, f, ensure_ascii=False, indent=2)
    print(f"Generated 初学记.json with {len(categories)} unique categories and {total_entries_found} total entries.")


if __name__ == "__main__":
    main()