Update: 初学记、佩文韵府 and 五车韵瑞
This commit is contained in:
69
佩文韵府/fix_pipes.py
Normal file
69
佩文韵府/fix_pipes.py
Normal file
@@ -0,0 +1,69 @@
|
||||
import json
import os
import re
|
||||
|
||||
# Read the entire dictionary into memory; the rest of the script edits this
# object in place and writes it back at the end.
print("Loading peiwenyunfu.json...")
with open('peiwenyunfu.json', mode='r', encoding='utf-8') as src:
    data = json.loads(src.read())
|
||||
|
||||
# Leading markers that may precede a headword and are stripped before the
# headword is used to expand placeholders.
prefixes = ["韻藻", "增", "増"]


def clean_headword(word):
    """Return *word* with leading markers from ``prefixes`` removed.

    The marker list is scanned twice, in order, so stacked markers are
    handled. A marker is only removed when at least one character would
    remain afterwards, so a headword that consists solely of a marker is
    returned unchanged.
    """
    stripped = word
    # Repeating the list is the same as two consecutive in-order passes.
    for marker in prefixes * 2:
        remainder = stripped[len(marker):]
        if remainder and stripped.startswith(marker):
            stripped = remainder
    return stripped
|
||||
|
||||
def replace_pipes_in_content(content, word):
    """Expand "丨" placeholders in *content* using the characters of *word*.

    The headword is first passed through ``clean_headword``. Each maximal
    run of "丨" is then rewritten:

    * a run whose length is a multiple of the cleaned headword's length
      becomes that many whole copies of the headword, and the rolling
      character position is reset to the start of the headword;
    * any other run is filled one character at a time, continuing from the
      rolling position (wrapping around the headword) left by earlier runs.

    *content* is returned untouched when the cleaned headword is empty or
    when it contains no "丨".
    """
    headword = clean_headword(word)
    size = len(headword)
    if not size or "丨" not in content:
        return content

    # Rolling position inside the headword, shared by every partial run.
    cursor = 0

    def expand(run):
        nonlocal cursor
        run_len = len(run.group(0))
        copies, leftover = divmod(run_len, size)
        if not leftover:
            # The run covers whole headwords: realign to the first character.
            cursor = 0
            return headword * copies
        # Partial run: keep cycling through the headword from where we stopped.
        begin = cursor
        cursor = begin + run_len
        return "".join(
            headword[pos % size] for pos in range(begin, begin + run_len)
        )

    return re.sub(r'丨+', expand, content)
|
||||
|
||||
print("Processing...")
for rhyme, r_data in data.items():
    # These two top-level keys are not rhyme sections; leave them alone.
    if rhyme == "metadata" or rhyme == "preface":
        continue

    # 1. In the rhyme description the placeholder stands for the rhyme
    #    character itself, which is taken to be the dictionary key.
    if "小韵描述" in r_data:
        description = r_data["小韵描述"]
        if description:
            r_data["小韵描述"] = description.replace("丨", rhyme)

    # 2. In each entry the placeholder stands for characters of the headword.
    #    The mapping is rebuilt and swapped in only once every entry is done.
    if "词条" in r_data:
        r_data["词条"] = {
            headword: replace_pipes_in_content(text, headword)
            for headword, text in r_data["词条"].items()
        }
|
||||
|
||||
# Write to a sibling temporary file and swap it into place afterwards.
# Opening 'peiwenyunfu.json' directly in 'w' mode truncates it immediately,
# so an interruption or I/O error during the dump would destroy the only
# copy of the data this script has just read.
print("Saving peiwenyunfu.json...")
tmp_path = 'peiwenyunfu.json.tmp'
with open(tmp_path, 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)
# The temp file lives next to the target, so this is a same-filesystem
# rename; the original stays intact until the new file is complete.
os.replace(tmp_path, 'peiwenyunfu.json')
print("Done!")
|
||||
Reference in New Issue
Block a user