Update: 删除垃圾程序

This commit is contained in:
denglifan
2026-03-22 16:43:10 +08:00
parent 183b842090
commit 2881163106
41 changed files with 93 additions and 2046 deletions

View File

@@ -1,161 +0,0 @@
import bs4
import os
import re
import json
# Directory holding the XHTML chapter files extracted from the source EPUB.
html_dir = "epub_extracted/OPS"
def parse_html(filepath):
    """Parse one chapter XHTML file into an ordered event stream.

    Returns a list of ("section", title) and ("text", fragment) tuples, in
    document order, drawn from every <p> inside <div class="poem">.  <br/>
    separators are deliberately skipped so the text reads as one run.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        soup = bs4.BeautifulSoup(f.read(), "html.parser")
    events = []
    for poem in soup.find_all("div", class_="poem"):
        for paragraph in poem.find_all("p"):
            for node in paragraph.children:
                if node.name == "br":
                    # Ignore line breaks: continuous text is wanted here.
                    continue
                # NOTE(review): the empty-string arguments to endswith/startswith
                # below look like marker characters lost in transit — confirm
                # against the original script before relying on these branches.
                if node.name == "span" and node.get("id") and node.text.strip().endswith(""):
                    events.append(("section", node.text.strip()))
                elif node.name == "small":
                    small_text = node.get_text()
                    if not small_text.startswith("") and not small_text.startswith(""):
                        events.append(("text", f"{small_text}"))
                    else:
                        events.append(("text", small_text))
                elif isinstance(node, str):
                    events.append(("text", node))
                else:
                    events.append(("text", node.get_text()))
    return events
def extract_sections(content_text):
    """Split an entry body into its 叙事 / 事对 / 诗文 sections.

    叙事 runs up to the 事對/事对 marker (when present); 事对 runs up to the
    first 〉 that is immediately followed by a genre character, where 诗文
    begins.  All three values are stripped before being returned.
    """
    result = {"叙事": "", "事对": "", "诗文": ""}
    genre_pattern = r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛|讃|啟|辭|辞|操|對|对)"
    marker_pos = content_text.find("事對")
    if marker_pos == -1:
        marker_pos = content_text.find("事对")
    if marker_pos == -1:
        # No 事對 marker: the text is 叙事, optionally followed by 诗文.
        genre = re.search(genre_pattern, content_text)
        if genre is None:
            result["叙事"] = content_text
        else:
            cut = genre.start() + 1  # keep the closing 〉 with 叙事
            result["叙事"] = content_text[:cut]
            result["诗文"] = content_text[cut:]
    else:
        result["叙事"] = content_text[:marker_pos]
        remainder = content_text[marker_pos + 2:]  # skip the marker itself
        genre = re.search(genre_pattern, remainder)
        if genre is None:
            result["事对"] = remainder
        else:
            cut = genre.start() + 1  # keep the closing 〉 with 事对
            result["事对"] = remainder[:cut]
            result["诗文"] = remainder[cut:]
    # NOTE(review): replace("", "") is a no-op — the arguments were probably
    # bracket characters lost in transit; confirm against the original script.
    return {k: v.replace("", "").replace("", "").strip() for k, v in result.items()}
def main():
    """Parse all 30 volumes, group entries by headword, and write 初学记.json."""
    categories = {}
    total_entries_found = 0
    for vol in range(1, 31):
        # Locate the XHTML file for this volume by its "juanNN" suffix.
        filename = None
        for fn in os.listdir(html_dir):
            if fn.endswith(f"juan{vol:02d}.xhtml"):
                filename = fn
                break
        if not filename:
            print(f"Volume {vol} not found")
            continue
        events = parse_html(os.path.join(html_dir, filename))
        # Merge consecutive "text" events into one string per section heading.
        merged = []
        current_section = ""
        current_text = []
        for ev_type, val in events:
            if ev_type == "section":
                if current_text:
                    merged.append((current_section, "".join(current_text)))
                    current_text = []
                current_section = val
            else:
                current_text.append(val)
        if current_text:
            merged.append((current_section, "".join(current_text)))
        for sec, txt in merged:
            # An entry starts with its headword, an optional 第N(上/下) ordinal,
            # then a 〈叙事〉 (or traditional 〈敘事〉) marker.
            matches = list(re.finditer(r"(?:^|[〉)\u3000\s])([^〈〉<>\s]+?)(?:第[一二三四五六七八九十百]+(?:[上下])?)?〈(?:叙事|敘事)〉", txt))
            if not matches:
                continue
            for i, m in enumerate(matches):
                entry_name = m.group(1).strip()
                start_idx = m.end()  # content starts right after 〈叙事〉
                # End at the NEXT entry's headword — matches[i+1].start(1), not
                # matches[i+1].start().  The optional boundary character matched
                # before group 1 (e.g. the 〉 closing the last quotation, as in
                # "…終負昔賢心〉虹蜺第七〈敘事〉") belongs to the CURRENT entry's
                # 诗文 text, so the slice must keep it on this side.
                end_idx = matches[i+1].start(1) if i + 1 < len(matches) else len(txt)
                content_text = txt[start_idx:end_idx]
                sections = extract_sections(content_text)
                if entry_name not in categories:
                    categories[entry_name] = []
                categories[entry_name].append({
                    "volume": vol,
                    "section": sec,
                    "content": sections
                })
                total_entries_found += 1
    final_json = {
        "metadata": {
            "title": "初学记",
            "author": "徐坚",
            "dynasty": "",
            "total_volumes": 30,
            "source": "2026年1月28日从维基文库导出"
        },
        "preface": "",
        "categories": categories
    }
    with open("初学记.json", "w", encoding="utf-8") as f:
        json.dump(final_json, f, ensure_ascii=False, indent=2)
    print(f"Generated 初学记.json with {len(categories)} unique categories and {total_entries_found} total entries.")


if __name__ == "__main__":
    main()

View File

@@ -1,24 +0,0 @@
import bs4
import re
import sys
def parse_html(filepath):
    """Debug helper: print every section heading found in one chapter file.

    Walks each <p> inside <div class="poem"> and reports <span id=...> nodes
    whose text looks like a section title; other node types are ignored.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        soup = bs4.BeautifulSoup(f.read(), "html.parser")
    current_section = None
    for div in soup.find_all("div", class_="poem"):
        for p in div.find_all("p"):
            for child in p.children:
                # NOTE(review): endswith("") is always True — the argument was
                # probably a marker character lost in transit; confirm against
                # the original script.
                if child.name == "span" and child.get("id") and child.text.strip().endswith(""):
                    current_section = child.text.strip()
                    print("Found Section:", current_section)
                elif child.name == "br":
                    pass
                # Fixed: was `type(child) == bs4.element.NavigableString`;
                # isinstance also covers NavigableString subclasses and is the
                # idiomatic type check.
                elif isinstance(child, bs4.element.NavigableString):
                    pass
# Ad-hoc run against volume 1 of the extracted EPUB.
parse_html("epub_extracted/OPS/c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml")

View File

@@ -1,54 +0,0 @@
import bs4
import re
import os
# Directory holding the XHTML chapter files extracted from the source EPUB.
html_dir = "epub_extracted/OPS"
def parse_html(filepath):
    """Parse one chapter file into ("section", title) / ("line", text) events.

    Text between <br/> separators inside each <p> is joined into a single
    "line" event; <span id=...> headings become "section" events.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        soup = bs4.BeautifulSoup(f.read(), "html.parser")
    events = []
    for poem in soup.find_all("div", class_="poem"):
        for paragraph in poem.find_all("p"):
            buffer = []
            for node in paragraph.children:
                if node.name == "br":
                    # A line break ends the current line (empty lines dropped).
                    if buffer:
                        events.append(("line", "".join(buffer).strip()))
                        buffer = []
                elif (
                    node.name == "span"
                    and node.get("id")
                    and node.text.strip().endswith("")
                ):
                    events.append(("section", node.text.strip()))
                elif node.name == "small":
                    # NOTE(review): the startswith("") tests and the bare
                    # f-string below look like bracket characters lost in
                    # transit — confirm against the original script.
                    small_text = node.get_text()
                    if not small_text.startswith("") and not small_text.startswith(""):
                        buffer.append(f"{small_text}")
                    else:
                        buffer.append(small_text)
                else:
                    buffer.append(node.get_text())
            if buffer:
                events.append(("line", "".join(buffer).strip()))
    return events
# Smoke test: parse volume 1 and show the first 30 events.
events = parse_html(
    os.path.join(html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml")
)
for e in events[:30]:
    print(e)

View File

@@ -1,127 +0,0 @@
import bs4
import os
import re
# Directory holding the XHTML chapter files extracted from the source EPUB.
html_dir = "epub_extracted/OPS"
def parse_html(filepath):
    """Tokenise one chapter file into ("section", title) / ("line", text) events."""
    with open(filepath, "r", encoding="utf-8") as f:
        soup = bs4.BeautifulSoup(f.read(), "html.parser")
    events = []
    for div in soup.find_all("div", class_="poem"):
        for p in div.find_all("p"):
            pieces = []

            def flush():
                # Emit accumulated text as one "line" event; skip empty runs.
                if pieces:
                    events.append(("line", "".join(pieces).strip()))
                    pieces.clear()

            for child in p.children:
                if child.name == "br":
                    flush()
                elif child.name == "span" and child.get("id") and child.text.strip().endswith(""):
                    events.append(("section", child.text.strip()))
                elif child.name == "small":
                    # NOTE(review): the empty-string literals here look like
                    # bracket characters lost in transit — confirm.
                    small_text = child.get_text()
                    if not small_text.startswith("") and not small_text.startswith(""):
                        pieces.append(f"{small_text}")
                    else:
                        pieces.append(small_text)
                else:
                    pieces.append(child.get_text())
            flush()
    return events
def extract_sections(entry_text):
    """Split a full entry (headword + markers + body) into 叙事 / 事对 / 诗文.

    The body is everything after the 〈叙事〉 (or 〈敘事〉) marker; 事对 starts
    at the literal 事對/事对 marker and 诗文 at the first 〉 immediately
    followed by a genre character.
    """
    result = {"叙事": "", "事对": "", "诗文": ""}
    narrative_match = re.search(r"〈(叙事|敘事)〉(.*)", entry_text)
    if not narrative_match:
        # No narrative marker: nothing to segment.
        return result
    body = narrative_match.group(2)
    genre_pattern = r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛|讃|啟)"
    marker = body.find("事對")
    if marker == -1:
        marker = body.find("事对")
    if marker != -1:
        result["叙事"] = body[:marker]
        tail = body[marker + 2:]  # skip the 事對 marker itself
        hit = re.search(genre_pattern, tail)
        if hit:
            cut = hit.start() + 1  # keep the closing 〉 with 事对
            result["事对"], result["诗文"] = tail[:cut], tail[cut:]
        else:
            result["事对"] = tail
    else:
        hit = re.search(genre_pattern, body)
        if hit:
            cut = hit.start() + 1
            result["叙事"], result["诗文"] = body[:cut], body[cut:]
        else:
            result["叙事"] = body
    # NOTE(review): replace("", "") is a no-op — the arguments were likely
    # bracket characters lost in transit; confirm against the original script.
    for k in result:
        result[k] = result[k].replace("", "").replace("", "")
    return result
events = parse_html(
    os.path.join(html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml")
)
# Group the event stream into entries: a line containing 〈叙事〉/〈敘事〉 starts
# a new entry; everything until the next such line belongs to the current one.
entries = []
current_entry_text = []
for ev_type, text in events:
    if ev_type == "section":
        pass  # section headings are ignored in this experiment
    else:
        if "〈叙事〉" in text or "〈敘事〉" in text:
            if current_entry_text:
                full_text = "".join(current_entry_text)
                # Headword = leading text, minus an optional 第N(上/下) ordinal.
                match = re.search(
                    r"^(.*?)(?:第[一二三四五六七八九十百]+(?:[上下])?)?〈(?:叙事|敘事)〉",
                    full_text,
                )
                if match:
                    entry_name = match.group(1).strip()
                    sections = extract_sections(full_text)
                    entries.append((entry_name, sections))
            current_entry_text = [text]
        else:
            current_entry_text.append(text)
# Flush the trailing entry, if any.
if current_entry_text:
    full_text = "".join(current_entry_text)
    match = re.search(
        r"^(.*?)(?:第[一二三四五六七八九十百]+(?:[上下])?)?〈(?:叙事|敘事)〉", full_text
    )
    if match:
        entry_name = match.group(1).strip()
        sections = extract_sections(full_text)
        entries.append((entry_name, sections))
# Preview the first three parsed entries.
for entry_name, sections in entries[:3]:
    print("Entry:", entry_name)
    print("叙事:", sections["叙事"][:50])
    print("事对:", sections["事对"][:50])
    print("诗文:", sections["诗文"][:50])
    print("-" * 20)

View File

@@ -1,35 +0,0 @@
import bs4
import os
import json
import re
# Directory holding the XHTML chapter files extracted from the source EPUB.
html_dir = "epub_extracted/OPS"
def parse_html(filepath):
    """Return the text of every <br/>-separated line inside the poem <p> tags.

    Note: each <br/> flushes unconditionally, so consecutive breaks yield
    empty strings — no filtering is applied here.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        soup = bs4.BeautifulSoup(f.read(), "html.parser")
    lines = []
    for poem in soup.find_all("div", class_="poem"):
        for paragraph in poem.find_all("p"):
            parts = []
            for node in paragraph.children:
                if node.name == "br":
                    lines.append("".join(parts).strip())
                    parts = []
                else:
                    parts.append(node.get_text())
            if parts:
                lines.append("".join(parts).strip())
    return lines
# Smoke test: dump the first 50 extracted lines of volume 1.
texts = parse_html(
    os.path.join(html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml")
)
for i, t in enumerate(texts[:50]):
    print(f"{i}: {t}")

View File

@@ -1,93 +0,0 @@
import bs4
import os
import json
import re
html_dir = "epub_extracted/OPS"
def parse_html(filepath):
    """Flatten the poem paragraphs of one chapter into a list of text lines."""
    with open(filepath, "r", encoding="utf-8") as f:
        soup = bs4.BeautifulSoup(f.read(), "html.parser")
    texts = []
    for div in soup.find_all("div", class_="poem"):
        for p in div.find_all("p"):
            segment = []
            for child in p.children:
                if child.name != "br":
                    segment.append(child.get_text())
                else:
                    # <br/> terminates the current line (even when empty).
                    texts.append("".join(segment).strip())
                    segment = []
            if segment:
                texts.append("".join(segment).strip())
    return texts
def extract_sections(entry_text):
    """Carve an entry's text into 叙事 / 事对 / 诗文 parts.

    Only the simplified 〈叙事〉 marker is recognised here.  事对 begins at the
    literal 事對/事对 marker; 诗文 begins at the first 〉 immediately followed
    by a genre character.
    """
    result = {"叙事": "", "事对": "", "诗文": ""}
    found = re.search(r"〈叙事〉(.*)", entry_text)
    if found is None:
        return result
    body = found.group(1)
    genre_pattern = r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛)"
    pos = body.find("事對")
    if pos == -1:
        pos = body.find("事对")
    if pos == -1:
        # No 事对 marker: split straight into 叙事 + optional 诗文.
        hit = re.search(genre_pattern, body)
        if hit is None:
            result["叙事"] = body
        else:
            cut = hit.start() + 1  # keep the closing 〉 with 叙事
            result["叙事"] = body[:cut]
            result["诗文"] = body[cut:]
    else:
        result["叙事"] = body[:pos]
        tail = body[pos + 2:]  # skip the 事對 marker itself
        hit = re.search(genre_pattern, tail)
        if hit is None:
            result["事对"] = tail
        else:
            cut = hit.start() + 1  # keep the closing 〉 with 事对
            result["事对"] = tail[:cut]
            result["诗文"] = tail[cut:]
    return result
texts = parse_html(
    os.path.join(html_dir, "c1_chu_xue_ji__si_ku_quan_shu_ben__juan01.xhtml")
)
# A line containing 〈叙事〉 is one complete entry; split off the headword.
entries = []
for line in texts:
    if "〈叙事〉" in line:
        entry_name = line.split("〈叙事〉")[0]
        # Drop the trailing 第N(上/下) ordinal from the headword.
        word_name = re.sub(
            r"第[一二三四五六七八九十百]+(?:[上下])?$", "", entry_name
        ).strip()
        sections = extract_sections(line)
        entries.append((word_name, entry_name, sections))
# Preview the first three entries (lengths only).
for e in entries[:3]:
    print(f"Word: {e[0]}")
    print(f"Entry: {e[1]}")
    print(f"叙事 len: {len(e[2]['叙事'])}")
    print(f"事对 len: {len(e[2]['事对'])}")
    print(f"诗文 len: {len(e[2]['诗文'])}")
    print("-" * 20)

View File

@@ -1,41 +0,0 @@
import re
def extract_sections(content_text):
    """Split an entry body into 叙事 / 事对 / 诗文 sections.

    叙事 runs until the 事對/事对 marker (if present); 事对 runs until the
    first 〉 immediately followed by a genre character, which starts 诗文.
    """
    result = {"叙事": "", "事对": "", "诗文": ""}
    rest_after_narrative = content_text
    shidui_start = rest_after_narrative.find("事對")
    if shidui_start == -1:
        shidui_start = rest_after_narrative.find("事对")
    # Fixed: the original alternation listed 銘 and 論 twice; duplicate
    # branches in a regex alternation are redundant and have been removed.
    genre_pattern = r"〉(賦|詩|讚|表|碑|頌|銘|檄|文|啓|書|歌|曲|引|記|箴|七|連珠|弔|祭文|詔|令|誄|序|論|賛|讃|啟|辭|辞|操|對|对)"
    if shidui_start != -1:
        result["叙事"] = rest_after_narrative[:shidui_start]
        rest_after_shidui = rest_after_narrative[shidui_start + 2:]
        shiwen_match = re.search(genre_pattern, rest_after_shidui)
        if shiwen_match:
            split_idx = shiwen_match.start() + 1  # keep the closing 〉 with 事对
            result["事对"] = rest_after_shidui[:split_idx]
            result["诗文"] = rest_after_shidui[split_idx:]
        else:
            result["事对"] = rest_after_shidui
    else:
        shiwen_match = re.search(genre_pattern, rest_after_narrative)
        if shiwen_match:
            split_idx = shiwen_match.start() + 1
            result["叙事"] = rest_after_narrative[:split_idx]
            result["诗文"] = rest_after_narrative[split_idx:]
        else:
            result["叙事"] = rest_after_narrative
    for k in result:
        # NOTE(review): replace("", "") is a no-op — the arguments were likely
        # bracket characters lost in transit; confirm against the original.
        result[k] = result[k].replace("", "").replace("", "")
    return result
import json  # unused here; retained from the original scratch script

# Quick sanity check of the splitter on a synthetic entry.
print(extract_sections("這里是叙事事對這裡是事對〉詩這里是詩文"))