scel2text.py 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171
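"""
Convert Sogou cell dictionaries (.scel) to Rime (Squirrel) dictionaries.

A Sogou scel dictionary is a Unicode-encoded file stored in a fixed layout in
which every two bytes represent one character (a Chinese character or an
English letter). It has two main parts:

1. The global pinyin table, at file offset 0x1540+4, whose entries have the
   form (py_idx, py_len, py_str)
   - py_idx: two-byte integer, the index of this pinyin
   - py_len: two-byte integer, the byte length of the pinyin
   - py_str: the pinyin itself, two bytes per character, py_len bytes in total

2. The Chinese phrase table, at file offset 0x2628 or 0x26c4, whose entries
   have the form
   (word_count, py_idx_count, py_idx_data, (word_len, word_str, ext_len, ext){word_count}),
   where (word_len, word_str, ext_len, ext) is repeated word_count times,
   i.e. word_count words share the same pinyin
   - word_count: two-byte integer, the number of homophones
   - py_idx_count: two-byte integer, the byte length of py_idx_data, i.e.
     twice the number of pinyin indices (the parser halves this value)
   - py_idx_data: a sequence of two-byte integers, each one a pinyin index
   - word_len: two-byte integer, the byte length of the Chinese phrase
   - word_str: the Chinese phrase, two bytes per character, word_len bytes
     in total
   - ext_len: two-byte integer, presumably the length of the extension data;
     it appears to always be 10
   - ext: extension data, 10 bytes in total; the first two bytes are an
     integer (possibly the word frequency, unconfirmed) and the remaining
     eight bytes are all 0; ext_len and ext together occupy 12 bytes

References
1. https://raw.githubusercontent.com/archerhu/scel2mmseg/master/scel2mmseg.py
2. https://raw.githubusercontent.com/xwzhong/small-program/master/scel-to-txt/scel2txt.py
"""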
  20. import struct
  21. import os
  22. import sys
  23. def read_utf16_str(f, offset=-1, len=2):
  24. if offset >= 0:
  25. f.seek(offset)
  26. string = f.read(len)
  27. return string.decode('UTF-16LE')
  28. def read_uint16(f):
  29. return struct.unpack('<H', f.read(2))[0]
  30. def get_hz_offset(f):
  31. mask = f.read(128)[4]
  32. if mask == 0x44:
  33. return 0x2628
  34. elif mask == 0x45:
  35. return 0x26c4
  36. else:
  37. print("不支持的文件类型(无法获取汉语词组的偏移量)")
  38. sys.exit(1)
  39. def get_dict_meta(f):
  40. title = read_utf16_str(f, 0x130, 0x338 - 0x130)
  41. category = read_utf16_str(f, 0x338, 0x540 - 0x338)
  42. desc = read_utf16_str(f, 0x540, 0xd40 - 0x540)
  43. samples = read_utf16_str(f, 0xd40, 0x1540 - 0xd40)
  44. return title, category, desc, samples
  45. def get_py_map(f):
  46. py_map = {}
  47. f.seek(0x1540 + 4)
  48. while True:
  49. py_idx = read_uint16(f)
  50. py_len = read_uint16(f)
  51. py_str = read_utf16_str(f, -1, py_len)
  52. if py_idx not in py_map:
  53. py_map[py_idx] = py_str
  54. # 如果拼音为 zuo,说明是最后一个了
  55. if py_str == 'zuo':
  56. break
  57. return py_map
  58. def get_records(f, file_size, hz_offset, py_map):
  59. f.seek(hz_offset)
  60. records = []
  61. while f.tell() != file_size:
  62. word_count = read_uint16(f)
  63. py_idx_count = int(read_uint16(f) / 2)
  64. py_set = []
  65. for i in range(py_idx_count):
  66. py_idx = read_uint16(f)
  67. if (py_map.get(py_idx, None) == None):
  68. return records
  69. py_set.append(py_map[py_idx])
  70. py_str = " ".join(py_set)
  71. for i in range(word_count):
  72. word_len = read_uint16(f)
  73. word_str = read_utf16_str(f, -1, word_len)
  74. # 跳过 ext_len 和 ext 共 12 个字节
  75. f.read(12)
  76. records.append((py_str, word_str))
  77. return records
  78. def get_words_from_sogou_cell_dict(fname):
  79. with open(fname, 'rb') as f:
  80. hz_offset = get_hz_offset(f)
  81. (title, category, desc, samples) = get_dict_meta(f)
  82. # print("title: %s\ncategory: %s\ndesc: %s\nsamples: %s" %
  83. # (title, category, desc, samples))
  84. py_map = get_py_map(f)
  85. file_size = os.path.getsize(fname)
  86. words = get_records(f, file_size, hz_offset, py_map)
  87. return words
  88. def save(records, f):
  89. records_translated = list(map(lambda x: "%s\t%s" % (
  90. x[1], x[0]), records))
  91. f.write("\n".join(records_translated))
  92. return records_translated
  93. def main():
  94. # 将要转换的词库添加在 scel 目录下
  95. scel_files = list(filter(lambda x: x.endswith('.scel'), [
  96. i for i in os.listdir("./scel")]))
  97. dict_file = "luna_pinyin.sogou.dict.yaml"
  98. dict_file_content = []
  99. dict_file_header = """# Rime dictionary
  100. # encoding: utf-8
  101. #
  102. # Sogou Pinyin Dict - 搜狗细胞词库
  103. #
  104. # https://pinyin.sogou.com/dict/
  105. #
  106. # 包括:
  107. #
  108. %s
  109. #
  110. ---
  111. name: luna_pinyin.sogou
  112. version: "1.0"
  113. sort: by_weight
  114. use_preset_vocabulary: true
  115. ...
  116. """
  117. sougo_dict_name_list = list(
  118. map(lambda x: "# * %s" % x.replace(".scel", ""), scel_files))
  119. dict_file_content.append(dict_file_header % "\n".join(sougo_dict_name_list))
  120. if not os.path.exists("./out"):
  121. os.mkdir("./out")
  122. for scel_file in scel_files:
  123. records = get_words_from_sogou_cell_dict(
  124. os.path.join("./scel", scel_file))
  125. print("%s: %s 个词" % (scel_file, len(records)))
  126. with open(os.path.join("./out", scel_file.replace(".scel", ".txt")), "w") as fout:
  127. dict_file_content.extend(save(records, fout))
  128. print("-" * 80)
  129. print("合并后 %s: %s 个词" % (dict_file, len(dict_file_content) - 1))
  130. with open(os.path.join("./out", dict_file), "w") as dictfout:
  131. dictfout.write("\n".join(dict_file_content))
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()