文字拼音
下面是 _gen_zodiac_docx.py 的完整内容:
python
# -*- coding: utf-8 -*-
"""Generate Word doc: story with per-character pinyin alignment."""
from __future__ import annotations
from pathlib import Path
from docx import Document
from docx.enum.table import WD_TABLE_ALIGNMENT
from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_LINE_SPACING
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.shared import Cm, Pt, RGBColor, Twips
from pypinyin import Style, pinyin
# —— Textbook-style annotated reading text (compact layout) ——
PY_FONT_PT = 7
HAN_FONT_PT = 11
PY_COLOR = RGBColor(0x5C, 0x5C, 0x5C) # slightly lighter than the body text; a common colour for pinyin annotations
HAN_FONT_EAST_ASIA = "KaiTi" # KaiTi (regular script); Word falls back to another font if it is missing
HAN_FONT_LATIN = "Times New Roman"
PY_FONT_LATIN = "Times New Roman"
# First-line indent of about two characters (roughly matched to the font size); the row splitter must subtract it from the first row's available width so Word does not narrow columns to pull the table back onto the page
TEXTBOOK_BODY_INDENT_TWIPS = 380
# Matches the left + right tcMar applied in _style_pinyin_cell / _style_han_cell, plus a small allowance against compression
_CELL_LR_PAD_TWIPS = int(2 * Pt(0.15).twips) + 8
# When a row is close to full, prefer to break right after one of these punctuation marks (every cell still holds one character / one pinyin syllable)
_BREAK_PREFER_AFTER = set(
",。!?、;:“”‘’()【】《》…—·,.!?:;\"'()[]"
)
def usable_body_twips(doc: Document) -> int:
    """Return the usable text-area width of the first section, in twips.

    The value is page width minus both side margins minus a 160-twip safety
    allowance, and never less than 4800. It is used as the per-row budget
    when column widths are accumulated into rows.
    """
    first = doc.sections[0]
    page, left, right = (
        int(length.twips)
        for length in (first.page_width, first.left_margin, first.right_margin)
    )
    # Stay slightly short of the full text area so a table never fills it
    # completely (intended to keep Word from rescaling the columns).
    text_width = page - left - right - 160
    return text_width if text_width > 4800 else 4800
def column_width_twips(ch: str, py: str) -> int:
    """Return the minimum width of one column (cell padding included), in twips.

    The column must fit both the character in the lower row and its pinyin
    in the upper row, so the wider of the two requirements is used and the
    left/right cell padding is added on top.
    """
    # CJK unified ideographs get a wider glyph slot than punctuation/Latin.
    glyph_w = 255 if "\u4e00" <= ch <= "\u9fff" else 120
    # The pinyin slot grows with syllable length but has a floor of 365;
    # columns without pinyin only need a narrow slot.
    annotation_w = max(365, 175 + 80 * len(py)) if py else 145
    return _CELL_LR_PAD_TWIPS + max(glyph_w, annotation_w)
def set_table_fixed_layout(table) -> None:
    """Switch the table to fixed layout.

    Turns off autofit and makes sure ``w:tblPr`` carries a ``w:tblLayout``
    element whose type is ``fixed``; the intent is that Word keeps the
    explicit column widths instead of redistributing them.
    """
    table.autofit = False
    props = table._tbl.tblPr
    if props is None:
        return
    layout = props.find(qn("w:tblLayout"))
    if layout is None:
        # No layout element yet: create one before setting its type.
        layout = OxmlElement("w:tblLayout")
        props.append(layout)
    layout.set(qn("w:type"), "fixed")
def set_table_preferred_width_dxa(table, total_twips: int) -> None:
    """Set the table's preferred width (``w:tblW``) to ``total_twips`` in dxa.

    Any existing ``w:tblW`` is dropped first and a fresh one is appended.
    The width written is at least 1. Does nothing when the table has no
    ``w:tblPr``.
    """
    props = table._tbl.tblPr
    if props is None:
        return
    for stale in props.findall(qn("w:tblW")):
        props.remove(stale)
    width_el = OxmlElement("w:tblW")
    width_value = max(1, int(total_twips))
    width_el.set(qn("w:w"), str(width_value))
    width_el.set(qn("w:type"), "dxa")
    props.append(width_el)
def set_cell_width_dxa(cell, twips: int) -> None:
    """Write the cell width (``w:tcW``) as a dxa value, with a floor of 120."""
    cell_props = cell._tc.get_or_add_tcPr()
    width_el = cell_props.find(qn("w:tcW"))
    if width_el is None:
        # Reuse an existing element when present; otherwise add one.
        width_el = OxmlElement("w:tcW")
        cell_props.append(width_el)
    width_el.set(qn("w:w"), str(max(120, int(twips))))
    width_el.set(qn("w:type"), "dxa")
def sync_tbl_grid_col_widths(table, col_widths_twips: list[int]) -> None:
    """Copy the per-column widths into the table grid (``w:gridCol``).

    Each grid column is given the same width as the matching entry of
    ``col_widths_twips`` so the grid total equals the sum of the content
    widths. If the number of grid columns differs from the number of widths
    supplied, nothing is changed.
    """
    grid = table._tbl.tblGrid.gridCol_lst
    if len(grid) == len(col_widths_twips):
        for idx, width in enumerate(col_widths_twips):
            grid[idx].w = Twips(int(width))
def set_table_no_border(table) -> None:
    """Remove all table-level borders, including the inner grid lines.

    Existing ``w:tblBorders`` and ``w:tblStyle`` children are removed (the
    latter so no built-in style is applied), then a ``w:tblBorders``
    element with every edge set to ``none`` is appended. A ``w:tblPr`` is
    created when the table lacks one.
    """
    tbl = table._tbl
    props = tbl.tblPr
    if props is None:
        props = OxmlElement("w:tblPr")
        tbl.insert(0, props)

    stale_tags = {qn("w:tblBorders"), qn("w:tblStyle")}
    for child in list(props):
        if child.tag in stale_tags:
            props.remove(child)

    edge_attrs = (
        ("w:val", "none"),
        ("w:sz", "0"),
        ("w:space", "0"),
        ("w:color", "auto"),
    )
    borders = OxmlElement("w:tblBorders")
    for side in ("top", "left", "bottom", "right", "insideH", "insideV"):
        edge = OxmlElement("w:" + side)
        for attr_name, attr_value in edge_attrs:
            edge.set(qn(attr_name), attr_value)
        borders.append(edge)
    props.append(borders)
def set_cell_no_border(cell) -> None:
    """Remove the four outer borders of a single cell.

    Any existing ``w:tcBorders`` is dropped and replaced by one whose top,
    left, bottom and right edges are all written with ``w:val="none"``.
    """
    cell_props = cell._tc.get_or_add_tcPr()
    for stale in cell_props.findall(qn("w:tcBorders")):
        cell_props.remove(stale)

    edge_attrs = (
        ("w:val", "none"),
        ("w:sz", "0"),
        ("w:space", "0"),
        ("w:color", "auto"),
    )
    borders = OxmlElement("w:tcBorders")
    for side in ("top", "left", "bottom", "right"):
        edge = OxmlElement("w:" + side)
        for attr_name, attr_value in edge_attrs:
            edge.set(qn(attr_name), attr_value)
        borders.append(edge)
    cell_props.append(borders)
def clear_paragraph_border(paragraph) -> None:
    """Delete any ``w:pBdr`` (paragraph border) from the paragraph properties.

    Paragraphs without a ``w:pPr`` element are left untouched.
    """
    props = paragraph._element.pPr
    if props is not None:
        for border in props.findall(qn("w:pBdr")):
            props.remove(border)
def finalize_table_no_borders(table) -> None:
    """Final pass that strips borders at table, cell and paragraph level.

    Meant to run after styles and widths are set, so that no border is left
    on the table, on any of its cells, or on any paragraph inside a cell.
    """
    set_table_no_border(table)
    every_cell = (cell for row in table.rows for cell in row.cells)
    for cell in every_cell:
        set_cell_no_border(cell)
        for paragraph in cell.paragraphs:
            clear_paragraph_border(paragraph)
def set_cell_margins(
    cell,
    top: int = 0,
    bottom: int = 0,
    left: int = 0,
    right: int = 0,
) -> None:
    """Set the inner margins (``w:tcMar``) of a cell, all values in twips.

    Any existing ``w:tcMar`` is replaced. Negative values are clamped to 0.
    The callers in this file use the bottom margin to space the pinyin row
    from the character row.
    """
    cell_props = cell._tc.get_or_add_tcPr()
    for stale in cell_props.findall(qn("w:tcMar")):
        cell_props.remove(stale)

    margins = OxmlElement("w:tcMar")
    # Insertion order fixes the order of the child elements written out.
    per_side = {"top": top, "left": left, "bottom": bottom, "right": right}
    for side, amount in per_side.items():
        side_el = OxmlElement("w:" + side)
        side_el.set(qn("w:w"), str(max(0, int(amount))))
        side_el.set(qn("w:type"), "dxa")
        margins.append(side_el)
    cell_props.append(margins)
def set_table_indent(table, twips: int) -> None:
    """Indent the whole table from the left by ``twips`` (``w:tblInd``, dxa).

    Used in this file to emulate a first-line paragraph indent. Any existing
    ``w:tblInd`` is replaced; a table without ``w:tblPr`` is left unchanged.
    """
    props = table._tbl.tblPr
    if props is None:
        return
    for stale in props.findall(qn("w:tblInd")):
        props.remove(stale)
    indent_el = OxmlElement("w:tblInd")
    indent_el.set(qn("w:w"), str(int(twips)))
    indent_el.set(qn("w:type"), "dxa")
    props.append(indent_el)
def strip_run_horizontal_fit(run) -> None:
    """Remove ``w:fitText`` from the run properties, if present.

    The purpose is to keep the run's text from being squeezed horizontally
    to fit a given width. Runs without ``w:rPr`` are left untouched.
    """
    run_props = run._element.rPr
    if run_props is not None:
        for fit in run_props.findall(qn("w:fitText")):
            run_props.remove(fit)
def set_cell_no_wrap(cell) -> None:
    """Mark the cell as no-wrap so its content stays on a single line.

    A ``w:noWrap`` element is appended to the cell properties unless one is
    already there.
    """
    cell_props = cell._tc.get_or_add_tcPr()
    if cell_props.find(qn("w:noWrap")) is None:
        cell_props.append(OxmlElement("w:noWrap"))
def char_pinyin_list(s: str) -> tuple[list[str], list[str]]:
    """Split ``s`` into characters and pair each one with its pinyin.

    Consecutive characters in the U+4E00..U+9FFF range are passed to
    ``pinyin`` as one segment (tone-mark style, no heteronyms) and the
    result is read back one entry per character. Every other character is
    paired with an empty string.

    Returns:
        ``(characters, pinyins)``: two lists of equal length.
    """
    chars: list[str] = []
    readings: list[str] = []
    pos, total = 0, len(s)
    while pos < total:
        current = s[pos]
        if not ("\u4e00" <= current <= "\u9fff"):
            # Punctuation, Latin letters, digits, ...: no annotation.
            chars.append(current)
            readings.append("")
            pos += 1
            continue

        # Extend to the end of the run of Han characters.
        stop = pos + 1
        while stop < total and "\u4e00" <= s[stop] <= "\u9fff":
            stop += 1
        han_run = s[pos:stop]
        converted = pinyin(han_run, style=Style.TONE, heteronym=False)
        for offset, han in enumerate(han_run):
            candidates = converted[offset]
            chars.append(han)
            readings.append(candidates[0] if candidates else "")
        pos = stop
    return chars, readings
def split_into_width_chunks(
    chars: list[str],
    pys: list[str],
    budget_full_twips: int,
    indent_reserve_twips: int,
) -> list[tuple[list[str], list[str]]]:
    """Split a paragraph into rows that each fit the available width.

    Column widths are accumulated left to right; when the next column would
    exceed the row budget, a new row (a separate table) is started. The
    first row's budget is reduced by ``indent_reserve_twips``; every row's
    budget is at least 2200. When a row is cut before the end of the
    paragraph, the cut is moved back to just after the nearest preferred
    punctuation mark found within the last 12 characters of the row.

    Returns:
        A list of ``(characters, pinyins)`` pairs, one per row.
    """
    total = len(chars)
    rows: list[tuple[list[str], list[str]]] = []
    start = 0
    while start < total:
        # Only the first row gives up room for the paragraph indent.
        reserve = indent_reserve_twips if start == 0 else 0
        budget = max(2200, budget_full_twips - reserve)

        # Greedily take columns while they fit; the first column of a row
        # is always taken, even if it alone is wider than the budget.
        used = 0
        stop = start
        for idx in range(start, total):
            width = column_width_twips(chars[idx], pys[idx])
            if idx > start and used + width > budget:
                break
            used += width
            stop = idx + 1
        stop = max(stop, start + 1)

        if stop < total:
            # Not the last row: look back for a punctuation break point.
            window_start = stop - min(12, stop - start)
            for idx in reversed(range(window_start, stop)):
                if chars[idx] in _BREAK_PREFER_AFTER:
                    stop = idx + 1
                    break

        row_chars = chars[start:stop]
        row_pys = pys[start:stop]
        # Defensive re-check: drop trailing columns while the row is still
        # wider than the budget (a row always keeps at least one column).
        while len(row_chars) > 1:
            row_width = sum(
                column_width_twips(c, p) for c, p in zip(row_chars, row_pys)
            )
            if row_width <= budget:
                break
            row_chars = row_chars[:-1]
            row_pys = row_pys[:-1]
            stop -= 1

        if row_chars:
            rows.append((list(row_chars), list(row_pys)))
        start = stop
    return rows
def _style_pinyin_cell(cell) -> None:
    """Apply the pinyin-row look to a cell.

    Sets the cell margins (0 top, 1 pt bottom, 0.15 pt left/right), centres
    every paragraph with zero spacing and an exact line height of
    ``PY_FONT_PT + 1`` pt, and formats each run with the pinyin font, size
    and colour.
    """
    side = int(Pt(0.15).twips)
    set_cell_margins(cell, top=0, bottom=int(Pt(1).twips), left=side, right=side)
    for paragraph in cell.paragraphs:
        paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
        fmt = paragraph.paragraph_format
        fmt.space_before = Pt(0)
        fmt.space_after = Pt(0)
        fmt.line_spacing_rule = WD_LINE_SPACING.EXACTLY
        fmt.line_spacing = Pt(PY_FONT_PT + 1)
        for run in paragraph.runs:
            font = run.font
            font.size = Pt(PY_FONT_PT)
            font.name = PY_FONT_LATIN
            font.color.rgb = PY_COLOR
            # Also set the East Asian font slot to the same Latin font.
            run._element.rPr.rFonts.set(qn("w:eastAsia"), PY_FONT_LATIN)
            strip_run_horizontal_fit(run)
def _style_han_cell(cell) -> None:
    """Apply the character-row look to a cell.

    Sets the cell margins (0 top, 2.5 pt bottom, 0.15 pt left/right),
    centres every paragraph with zero spacing and an exact line height of
    ``HAN_FONT_PT + 2`` pt, and formats each run as non-bold black text in
    the body fonts.
    """
    side = int(Pt(0.15).twips)
    set_cell_margins(cell, top=0, bottom=int(Pt(2.5).twips), left=side, right=side)
    for paragraph in cell.paragraphs:
        paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
        fmt = paragraph.paragraph_format
        fmt.space_before = Pt(0)
        fmt.space_after = Pt(0)
        fmt.line_spacing_rule = WD_LINE_SPACING.EXACTLY
        fmt.line_spacing = Pt(HAN_FONT_PT + 2)
        for run in paragraph.runs:
            font = run.font
            font.size = Pt(HAN_FONT_PT)
            font.bold = False
            font.name = HAN_FONT_LATIN
            font.color.rgb = RGBColor(0x00, 0x00, 0x00)
            # Han glyphs are rendered with the East Asian font slot.
            run._element.rPr.rFonts.set(qn("w:eastAsia"), HAN_FONT_EAST_ASIA)
            strip_run_horizontal_fit(run)
def add_two_row_table(
    chars: list[str],
    pys: list[str],
    add_trailing_space: bool,
    doc: Document,
    apply_paragraph_indent: bool,
) -> None:
    """Append one borderless two-row table: pinyin on top, characters below.

    Args:
        chars: Characters of this row, one per column.
        pys: Pinyin for each character ("" where there is none).
        add_trailing_space: When true, add a small spacer paragraph after
            the table (callers pass true for the last row of a paragraph).
        doc: Document the table is appended to.
        apply_paragraph_indent: When true, indent the table by
            ``TEXTBOOK_BODY_INDENT_TWIPS`` (first row of a paragraph).
    """
    table = doc.add_table(rows=2, cols=len(chars))
    try:
        table.style = None
    except (KeyError, ValueError, AttributeError):
        # Best effort only: borders are removed explicitly below anyway.
        pass
    set_table_no_border(table)
    set_table_fixed_layout(table)
    # Table width equals the sum of its columns, so a short row does not
    # stretch to the full text width; it is left-aligned and simply ends.
    table.alignment = WD_TABLE_ALIGNMENT.LEFT
    if apply_paragraph_indent:
        set_table_indent(table, TEXTBOOK_BODY_INDENT_TWIPS)

    # Fetch each row's cells once instead of once per column: the row/cell
    # lookup does not depend on the loop variable, so hoisting it avoids
    # repeating that work for every column.
    pinyin_cells = table.rows[0].cells
    han_cells = table.rows[1].cells

    col_widths: list[int] = []
    for py_cell, han_cell, ch, py in zip(pinyin_cells, han_cells, chars, pys):
        set_cell_no_wrap(py_cell)
        set_cell_no_wrap(han_cell)
        py_cell.text = py
        han_cell.text = ch
        col_tw = column_width_twips(ch, py)
        col_widths.append(col_tw)
        set_cell_width_dxa(py_cell, col_tw)
        set_cell_width_dxa(han_cell, col_tw)
        _style_pinyin_cell(py_cell)
        _style_han_cell(han_cell)

    sync_tbl_grid_col_widths(table, col_widths)
    set_table_preferred_width_dxa(table, sum(col_widths))
    finalize_table_no_borders(table)

    if add_trailing_space:
        tail = doc.add_paragraph()
        tail.paragraph_format.space_after = Pt(2)
        tail.paragraph_format.space_before = Pt(0)
def add_paragraph_table(doc: Document, text: str) -> None:
    """Render one paragraph of text as a stack of pinyin/character tables.

    Trailing newlines are removed first. Blank text produces only a thin
    spacer paragraph. Otherwise the text is annotated, split into rows that
    fit the body width, and each row is emitted as its own two-row table;
    only the first row is indented and only the last one is followed by a
    spacer.
    """
    text = text.rstrip("\n")
    if text.strip():
        chars, pys = char_pinyin_list(text)
        rows = split_into_width_chunks(
            chars,
            pys,
            usable_body_twips(doc),
            TEXTBOOK_BODY_INDENT_TWIPS,
        )
        last_idx = len(rows) - 1
        for idx, (row_chars, row_pys) in enumerate(rows):
            add_two_row_table(
                row_chars,
                row_pys,
                idx == last_idx,
                doc,
                apply_paragraph_indent=(idx == 0),
            )
        return

    # Blank paragraph: emit a minimal-height gap instead of a table.
    spacer = doc.add_paragraph()
    spacer_fmt = spacer.paragraph_format
    spacer_fmt.space_before = Pt(0)
    spacer_fmt.space_after = Pt(1)
    spacer_fmt.line_spacing = Pt(1)
def main() -> None:
    """Build the annotated zodiac story and save it next to this script.

    The document gets narrow page margins, a centred title and subtitle,
    and one pinyin/character table stack per story paragraph. If the target
    file cannot be written because of a ``PermissionError``, the document is
    saved under an alternative name in the same directory instead.
    """
    paragraphs = [
        "很久很久以前,人们总记不住自己的生日。",
        "玉皇大帝想了个好办法,举办一场渡河比赛,选出十二只动物来代表年份。",
        "",
        "比赛那天,小老鼠故意骗猫说比赛还没开始,猫就安心睡觉了。",
        "老鼠悄悄爬到牛背上,老牛力气大,很快就游到了对岸。",
        "快到终点时,老鼠“嗖”地一跳,得了第一名。",
        "",
        "接下来,牛、老虎、兔子、龙、蛇、马、羊、猴、鸡、狗、猪,都顺利到达终点。",
        "玉皇大帝笑着说:“你们就是十二生肖啦!”",
        "",
        "等猫睡醒匆匆赶来,名额已经满了。",
        "猫又生气又难过,从此一见到老鼠就追。老鼠也就一直害怕猫啦。",
    ]
    doc = Document()
    for sec in doc.sections:
        sec.left_margin = Cm(1.75)
        sec.right_margin = Cm(1.75)
        sec.top_margin = Cm(1.45)
        sec.bottom_margin = Cm(1.45)

    # Title line.
    title_p = doc.add_paragraph()
    title_p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    title_p.paragraph_format.space_after = Pt(4)
    title_p.paragraph_format.space_before = Pt(0)
    tr = title_p.add_run("十二生肖的故事")
    tr.font.size = Pt(16)
    tr.font.bold = True
    tr.font.name = "SimHei"
    tr._element.rPr.rFonts.set(qn("w:eastAsia"), "SimHei")

    # Subtitle line.
    sub_p = doc.add_paragraph()
    sub_p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    sub_p.paragraph_format.space_after = Pt(3)
    sr = sub_p.add_run("(注音朗读)")
    sr.font.size = Pt(8)
    sr.font.color.rgb = RGBColor(0x66, 0x66, 0x66)
    sr.font.name = PY_FONT_LATIN
    sr._element.rPr.rFonts.set(qn("w:eastAsia"), HAN_FONT_EAST_ASIA)

    # add_paragraph_table already turns an empty string into the thin
    # spacer paragraph, so blank entries need no special case here.
    for block in paragraphs:
        add_paragraph_table(doc, block)

    out = Path(__file__).resolve().parent / "十二生肖故事-汉字拼音对照表.docx"
    try:
        doc.save(out)
    except PermissionError:
        # The target could not be written (the message below attributes
        # this to Word holding the file open); save under a fallback name.
        alt = out.with_name(out.stem + "-未占用时覆盖主文件.docx")
        doc.save(alt)
        print("saved (Word 占用主文件):", alt)
    else:
        print("saved:", out)


if __name__ == "__main__":
    main()