initial commit

2020-08-06 13:32:21 -04:00
commit 86eb38d053
13 changed files with 524 additions and 0 deletions
--- a/tools/patch.py
+++ b/tools/patch.py
@@ -0,0 +1,16 @@
+#!/usr/bin/env python2.7
+t = open("mobi/__init__.py",'r').read().replace(
+	"uncompress_lz77(self.contents[self.records[recordnum]['record Data Offset']:self.records[recordnum+1]['record Data Offset']-self.config['mobi']['extra bytes']])",
+	"result = uncompress_lz77(self.contents[self.records[recordnum]['record Data Offset']:self.records[recordnum+2]['record Data Offset']-self.config['mobi']['extra bytes']])"
+).replace(
+	"for record in range(1, self.config['mobi']['First Non-book index'] - 1):",
+	"for record in range(0, 10000000,1):"
+)
+open("mobi/__init__.py",'w').write(t)
+t = open("mobi/lz77.py",'r').read().replace(
+	"print(\"WARNING:","#print(\"WARNING:"
+).replace(
+	"\" beginning of text!",
+	"#"
+)
+open("mobi/lz77.py",'w').write(t)
--- a/tools/to_json.py
+++ b/tools/to_json.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python2.7
+# -*- coding: utf-8 -*-
+import os
+import re
+import io
+import sys 
+import json 
+
+reload(sys)  
+sys.setdefaultencoding('utf8')
+
+txt = open("../zdic.txt",'r').read()
+ent = re.findall(r'<h2>([^<>]*?)</h2>(.*?)<mbp:pagebreak/>', txt)
+ent = [(x[0].strip(),x[1]) for x in ent]
+ent = [x for x in ent if x[0] != ""]
+
+def first(l):
+    return "" if len(l) == 0 else l[0]
+
+def rem_bad_char(t):
+    return t.decode('utf-8','ignore')
+
+def heads_to_name(t):
+    u = [x for x in t if len(hex(ord(x)))-2 <= 4]
+    u = ("".join(u)).decode("utf-8","ignore")
+    #print u
+    return u
+
+def append_if_ok(heads,word):
+    try:
+        fn = "../zdic_json/"+heads_to_name(heads+[unicode(word)[0].lower()])+".json"
+        open(fn,'wb').write("test.")
+        os.remove(fn)
+        heads.append(unicode(word)[0].lower())
+        return True
+    except:
+        print("BAD WORD NAME:",word)
+        return False
+
+heads = []
+result = {}
+weirdo = {}
+for e in ent:
+    
+    word = e[0]
+    content = e[1]
+    print word,
+    
+    is_ok = True
+
+    if len(heads) == 0:
+        heads = []
+        is_ok = append_if_ok(heads,word)
+
+    else:
+        try:
+            b = not (word.lower()).startswith(heads[-1].lower())
+        except:
+            continue
+        if b:
+            if len(result) > 1000 or len(heads) >= 128:
+                fn = heads_to_name(heads)
+                open("../zdic_json/"+fn+".json",'wb').write(json.dumps(result))
+
+                result = {}
+                heads = []
+                is_ok = append_if_ok(heads,word)
+            else:
+                is_ok = append_if_ok(heads,word)
+
+
+    traditional = first(re.findall(r"#444\">\((.*?)\)",content)) + first(re.findall(r"繁体字:(.*?)</li>",content))
+    pinyin = first(re.findall(r"拼音.*?：(.*?)[<\t　]",content))
+    zhuyin = first(re.findall(r"注音.*?：(.*?)<",content))
+    definition = re.findall(r"<li.*?>(.*?)</li>",content)
+    if len(definition) == 0:
+        definition = [content]
+
+    definition = [re.sub(r"<.*?>","",d).strip() for d in definition]
+    definition = [re.sub(r"^.{0,1}\d.*?[\.\)]","",d).strip() for d in definition]
+    definition = [re.sub(r"===汉英互译===","",d).strip() for d in definition]
+    #definition = [re.sub(r"【解释】：","",d).strip() for d in definition]
+    traditional = re.sub(r"<.*?>","",traditional).replace(" ","")
+
+    definition = [rem_bad_char(d) for d in definition 
+        if  (len(pinyin) == 0 or (pinyin not in d and pinyin.replace(" ","") not in d)) \
+        and (len(zhuyin) == 0 or zhuyin not in d ) \
+        and "繁体" not in d
+        and "简体" not in d
+        and "郑码" not in d
+        and "拼音" not in d
+        and "粤语：" not in d
+        and "潮州话：" not in d
+        and "UniCode" not in d
+        and "◎" not in d
+        and len(d.replace(word,"")) > 0
+    ]
+
+    if len(definition) == 0:
+        definition = [rem_bad_char(content)]
+        definition = [re.sub(r"<.*?>","",d).strip() for d in definition]
+
+
+    thing = {"TRD":traditional.decode('utf-8',"ignore"), 
+             "PRN":[pinyin.decode('utf-8',"ignore"), zhuyin.decode('utf-8',"ignore")],
+             "DEF":definition,
+            }
+    if is_ok:
+        result[word]=thing
+    else:
+        weirdo[word]=thing
+
+open("../zdic_json/extended.json",'wb').write(json.dumps(weirdo))
+
--- a/tools/to_txt.py
+++ b/tools/to_txt.py
@@ -0,0 +1,8 @@
+#!/usr/bin/env python2.7
+from mobi import Mobi
+
+book = Mobi("../zdic.prc");
+book.parse();
+
+for record in book:
+	print record,