initial commit

This commit is contained in:
lingdong huang
2020-08-06 13:32:21 -04:00
commit 86eb38d053
13 changed files with 524 additions and 0 deletions

7
.gitignore vendored Normal file
View File

@@ -0,0 +1,7 @@
.DS_Store
*/DS_Store
zdic.prc
zdic.txt
zdic_json/*
tools/mobi/*
zdic-cli-*

88
README.md Normal file
View File

@@ -0,0 +1,88 @@
# zdic-cli
An alternative, offline, regex-supporting, command-line interface to [zdic (漢典)](https://zdic.net), featuring:
- No internet connection required
- Full text search with regex: a command for finding characters/phrases in body text of definitions.
- Colorful text for highlighting entries
![](screenshots/screen000.png)
![](screenshots/screen001.png)
![](screenshots/screen002.png)
![](screenshots/screen003.png)
![](screenshots/screen004.png)
## Downloads
Standalone version available for windows, mac and linux. Please refer to the Releases page.
## Usage
zdic-cli runs as a REPL loop, and there are 5 types of commands you can enter:
```
def 甲 display entry for 甲
pre 甲 list words that starts with 甲
has 甲 乙 ... list words that contains 甲 and 乙 ...
txt 甲 list words whose full entry text contains 甲 (regex supported)
sel n display entry at previously returned list index n
```
`def` is probably the most common one for simply looking up characters/words. `pre` `has` `txt` will return an enumerated list listing all eligible entries, and `sel 0`/`sel 1`/`sel n` can be used to select from the list.
## Development setup
**This section is for compiling from source (and is somewhat complex), if you just would like to use the software, please check out the Releases page.**
### Dependencies
- node.js/npm
- python2. Tested on 2.7. It has to be python 2 instead of 3 because much of the data-processing work was done way back in a time when 2 was the norm. Sorry folks, but if you're a python3 purist feel free to send a PR!
- pkg (optional, to build standalone binaries for multiple platforms) `npm install -g pkg`
### Downloading and compiling the dictionary files
This repo does not contain the dictionary files themselves as they're too large. Instead, a shell script (`setup.sh`) is provided to automate the process:
- Automatically download the original Kindle dictionary format (.PRC) from the internet. The download link hardcoded in `setup.sh` might fail in the future, in which case a google search for `汉典.prc` should yield alternative resources.
- Automatically download a python library (kroo/mobi-python) for parsing mobi files. It is a rather old library with some oddities, so a find-and-replace script will be automatically run to patch some glitches in the source code :P
- `tools/to_txt.py` is run to extract a raw `txt` file from the kindle format `prc`.
- `tools/to_json.py` is run to generate a directory of `json` files from the `txt` to make lookup and formatting more efficient.
Run the shell script with:
```
sh setup.sh
```
At this point you'll have `zdic.prc`, `zdic.txt` and directory `zdic_json/`. Only `zdic_json` is needed, so if everything went well with the script, you can freely delete the other two and gain some 700MB of free space :)
### Compiling the binary
At this point you can also run the software by simply doing:
```
node index.js
```
You can also package it into a binary using
```
pkg .
```
provided you have the node and pkg dependencies installed.
Tip: to run the software by typing `zdic` anywhere, you can symlink it to `/usr/bin`, e.g.
```
ln -s path/to/zdic-cli/zdic-cli /usr/bin/zdic
```

259
index.js Normal file
View File

@@ -0,0 +1,259 @@
// zdic-cli entry point: offline REPL front-end over the JSON shards
// produced by tools/to_json.py.
const fs = require("fs")
const readline = require('readline');
// Directory holding the per-head-character dictionary shard files.
const dict_path = __dirname+'/zdic_json'
// Find the shard file whose name contains the first character of `word`;
// fall back to the catch-all "extended.json" shard when none matches.
function locate_file(word){
  const folder = dict_path;
  const head = word.slice(0,1);
  const files = fs.readdirSync(folder);
  for (const name of files){
    if (name.includes(head)){
      return name;
    }
  }
  return "extended.json"
}
// Load and parse one dictionary shard (path relative to dict_path).
function read_json(file_path){
  const raw = fs.readFileSync(dict_path + "/" + file_path);
  return JSON.parse(String(raw));
}
/**
 * Look up the exact entry for a headword.
 * @param {string} word - headword to look up
 * @returns {[string, Object]|null} [word, entry] pair, or null when absent
 */
function define(word){
  // NOTE: the original used `var dict_path = ...` here, shadowing the
  // module-level `dict_path` constant; renamed for clarity. The unused
  // `ret` accumulator was also removed.
  const file_name = locate_file(word);
  const dict = read_json(file_name);
  if (word in dict){
    return [word, dict[word]];
  }
  return null;
}
/**
 * List every headword in `word`'s shard that begins with `word`.
 * @param {string} word - prefix to match
 * @returns {string[]} matching headwords, in shard iteration order
 */
function starts_with(word){
  const ret = [];
  const dict = read_json(locate_file(word));
  // BUG FIX: the loop variable was an implicit global (`for (w in dict)`),
  // which would also throw in strict mode; declared properly here.
  for (const w in dict){
    if (w.startsWith(word)){
      ret.push(w);
    }
  }
  return ret;
}
// List every headword, across all shards, that contains each of the
// strings in `words` as a substring.
function contains(words){
  const ret = [];
  const files = fs.readdirSync(dict_path);
  for (const file of files){
    if (!file.endsWith(".json")){
      continue;
    }
    const dict = read_json(file);
    for (const w in dict){
      // `every` short-circuits exactly like the original break-out loop.
      if (words.every((needle) => w.includes(needle))){
        ret.push(w);
      }
    }
  }
  return ret;
}
// Regex-search the body text of every definition across all shards.
// For each first match in an entry, `callback` receives
// [listIndex, headword, contextSnippet] and the headword is collected
// into the returned list.
function full_text_search(word, view_len, callback){
  const ret = [];
  const pad = Math.floor((view_len - word.length) / 2);
  const re = new RegExp(word);
  const files = fs.readdirSync(dict_path);
  for (const file of files){
    if (!file.endsWith(".json")){
      continue;
    }
    const dict = read_json(file);
    for (const w in dict){
      const defs = dict[w]['DEF'];
      for (let j = 0; j < defs.length; j++){
        const idx = defs[j].search(re);
        if (idx == -1){
          continue;
        }
        // Snippet: roughly `pad` characters of context on each side.
        const lo = Math.max(idx - pad, 0);
        const snippet = defs[j].slice(lo, idx + word.length + pad);
        const item = [ret.length, w, snippet];
        ret.push(w);
        callback(item);
        break; // report only the first matching definition per entry
      }
    }
  }
  return ret;
}
// Render one dictionary entry as an ANSI-colored string:
// headword (bold green), optional traditional form, the [pinyin , zhuyin]
// readings in yellow, then each definition line with manual word-wrapping
// to the terminal width.
function render_def(word,entry){
  var result = ""
  result += "\x1b[32m\033[1m"+word+"\x1b[0m"
  // Traditional form, shown only when present.
  if (entry['TRD'] != ''){
    result += " \x1b[32m\033[1m("+entry['TRD']+")\x1b[0m "
  }
  // PRN = [pinyin, zhuyin]; the bracket pair opens/closes only when
  // pinyin is non-empty.
  if (entry['PRN'][0] != ''){
    result += " \x1b[33m[ "+entry['PRN'][0].trim()+""
  }
  if (entry['PRN'][1] != ''){
    result += " , "+entry['PRN'][1].trim()
  }
  if (entry['PRN'][0] != ''){
    result += " ]\x1b[0m"
  }
  result += "\n"
  // Column budget; each glyph is counted as 2 columns (j += 2 below),
  // which fits CJK-dominated text.
  var n = process.stdout.columns-4;
  for (var i = 0; i < entry['DEF'].length; i++){
    result += "\x1b[2m\x1b[0m"
    var j = 1;
    var t = entry['DEF'][i];
    // isf ("is first") tracks whether we are before the first 。;
    // everything after it is rendered dim.
    var isf = true;
    for (var c of t){
      if (c == "《"){
        result += "\x1b[31m" // book/source titles 《…》 in red
      }
      // NOTE(review): comparing `c` to the empty string can never be true
      // inside for...of — a special marker character appears to have been
      // lost from this copy of the source; confirm against the original.
      if (c == ""){
        result += "\x1b[33m"+c+"\x1b[0m";
        if (!isf){
          result += "\x1b[2m"
        }
      }else{
        result += c;
      }
      if (c == "》"){
        result += "\x1b[0m"
        if (!isf){
          result += "\x1b[2m"
        }
      }
      if (c == "。" && isf){
        result += "\x1b[2m"
        isf = false;
      }
      j+=2;
      // Manual wrap once the column budget for this line is spent.
      if (j >= n){
        j = 2;
        result += "\n  "
      }
    }
    result+="\x1b[0m\n"
  }
  result += ""
  return result
}
// Right-pad `x` with spaces to exactly `n` characters, truncating when
// `x` is already longer than `n`.
function char_pad(x, n){
  return x.padEnd(n, " ").slice(0, n);
}
// Format one numbered list cell: a yellow 4-wide right-aligned index,
// a space, then the word padded to `n` characters.
function render_item(i, x, n){
  const idx = i.toString().padStart(4);
  return "\x1b[33m" + idx + "\x1b[0m " + char_pad(x, n);
}
/**
 * Lay matched words out in terminal-width columns.
 * Non-CJK characters are stripped from each word for display, unless
 * stripping would empty the word entirely.
 * @param {string[]} lst - words to render
 * @returns {string} multi-line ANSI string (no trailing newline on a
 *   partial final row)
 */
function render_list(lst){
  const npi = 6; // characters reserved per word cell
  // Cells per row: each cell is npi double-width glyphs plus the index.
  const npl = Math.floor((process.stdout.columns - 2) / (npi * 2 + 5));
  let result = "";
  for (let i = 0; i < lst.length; i++){
    let li = lst[i].replace(/[^一-鿿]/g, "");
    if (!li.length){ // oops - stripping removed everything, keep raw word
      li = lst[i];
    }
    result += render_item(i, li, npi);
    // Newline after the last cell of each full row.
    // (Dead `else { result += "" }` branch removed.)
    if (i % npl == npl - 1){
      result += "\n";
    }
  }
  return result;
}
// REPL command table: name -> [usage string, help text].
// Entering a command with no argument prints its row from this table;
// an unknown 3-letter prefix throws on lookup and falls into main()'s
// direct-query fallback.
var commands = {
"def":["def x ","display entry for x"],
"pre":["pre x ","list words that starts with x"],
"has":["has x y ...","list words that contains x and y ..."],
"txt":["txt x ","list words whose full entry text contains x (regex supported)"],
"sel":["sel n ","display entry at previously returned list index n"],
}
// One REPL turn: prompt, parse "cmd arg", dispatch, then recurse with the
// latest result list so a later `sel n` can index into it.
// `prev` is the word list returned by the previous pre/has/txt command.
function main(prev){
  var curr = null;
  var def = null;
  var fail = false;
  const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout
  });
  rl.question('> ', (answer) => {
    try{
      answer = answer.trim();
      // Commands are exactly 3 letters; the argument starts after "cmd ".
      var cmd = answer.slice(0,3);
      var arg = answer.slice(4);
      if (!arg.length){
        // No argument: print the usage line (throws on unknown cmd,
        // which the outer catch turns into a direct-query attempt).
        console.log(`\x1b[36m${commands[cmd][0]} \x1b[0m${commands[cmd][1]}`);
      }else{
        // NOTE: the `var def` declarations below all hoist to the single
        // `def` declared at the top of main().
        if (cmd == "def"){
          var def = define(arg);
          if (def){
            console.log(render_def(...def));
          }
        }else if (cmd == "pre"){
          curr = starts_with(arg);
          console.log(render_list(curr));
        }else if (cmd == "has"){
          curr = contains(arg.split(" "));
          console.log(render_list(curr));
        }else if (cmd == "txt"){
          // Matches are streamed through the callback as they are found.
          curr = full_text_search(arg,Math.floor((process.stdout.columns-20)/2),function(x){
            console.log(`${render_item(x[0],x[1],6)}\x1b[2m${x[2]}\x1b[0m`);
          });
        }else if (cmd == "sel"){
          // Index into the list produced by the previous command.
          var def = define(prev[parseInt(arg)]);
          if (def){
            console.log(render_def(...def))
          }
        }else{
          console.log("\x1b[31munsupported command.\x1b[0m")
          fail = true
        }
        if (!fail && (def == null && (curr == null || curr.length == 0))){
          console.log("\x1b[2m(0 result returned)\x1b[0m")
        }
      }
    }catch(e){
      // Anything unparseable is retried as a bare dictionary lookup.
      console.log("\x1b[31mcommand parse failed. trying as direct query...\x1b[0m")
      // console.log(e)
      try{
        var def = define(answer);
        if (def){
          console.log(render_def(...def));
        }else{
          throw new Error();
        }
      }catch(ee){
        console.log("\x1b[31mcommand parse totally failed.\x1b[0m")
      }
    }
    rl.close()
    // Carry the latest non-null list forward so `sel` still works after
    // a command that produced no list.
    main(curr||prev);
  });
}
// Startup banner; the box borders assume each CJK glyph renders 2 columns
// wide. NOTE(review): "Unoffical" is a typo, but correcting it would push
// the line past the box border, so the banner needs re-balancing to fix.
console.log("╔═════════════════════════════════════════════╗")
console.log("║\x1b[31m 漢 典 CLI \x1b[0m║")
console.log("║Unoffical offline 漢典 (zdic.net) commandline║")
console.log("║\x1b[2m w/ data derived from `汉典.prc` (for Kindle)\x1b[0m║")
console.log("║\x1b[2m Lingdong Huang 2020 \x1b[0m║");
console.log("╚═════════════════════════════════════════════╝")
// One-line help listing the available commands in cyan.
console.log(`commands: ${Object.keys(commands).map(x=>( "\x1b[36m"+x+"\x1b[0m" )).join(",") }, run without arguments to see help`);
// Start the REPL with no previous result list.
main();

7
package.json Normal file
View File

@@ -0,0 +1,7 @@
{
"name":"zdic-cli",
"pkg": {
"assets": "zdic_json/*"
},
"bin":"index.js"
}

BIN
screenshots/screen000.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 226 KiB

BIN
screenshots/screen001.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 370 KiB

BIN
screenshots/screen002.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 413 KiB

BIN
screenshots/screen003.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 407 KiB

BIN
screenshots/screen004.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 454 KiB

25
setup.sh Normal file
View File

@@ -0,0 +1,25 @@
# setup.sh - download the Kindle dictionary and convert it into the
# zdic_json/ directory consumed by index.js.  Run with: sh setup.sh
echo "downloading the dictionary..."
# NOTE(review): this hardcoded mirror may disappear; per the README,
# search for `汉典.prc` if the download fails.
curl https://blog.xjpvictor.info/wp-content/uploads/汉典.prc > zdic.prc
cd tools
echo "downloading dependencies..."
# kroo/mobi-python: an old python2 library for parsing .mobi/.prc files;
# only its `mobi` package is kept.
git clone https://github.com/kroo/mobi-python
cp -r mobi-python/mobi ./mobi
rm -rf mobi-python
echo "patching bugs in dependencies..."
# patch.py rewrites the vendored sources in-place (see tools/patch.py).
chmod +x patch.py
./patch.py
echo "converting dictionary to txt... (this might take a while ~10 mins)"
chmod +x to_txt.py
./to_txt.py > ../zdic.txt
echo "converting txt to json..."
mkdir ../zdic_json
chmod +x to_json.py
./to_json.py
echo "done setting up!"
echo "to use the app, either run 'pkg .' to package into a binary, or run 'node index.js' directly."

16
tools/patch.py Executable file
View File

@@ -0,0 +1,16 @@
#!/usr/bin/env python2.7
# Monkey-patch the vendored kroo/mobi-python sources in-place so they can
# fully extract the zdic .prc file.  Run once by setup.sh after cloning.
#
# Patch 1: capture the decompressed record into `result` and read through
# recordnum+2 instead of recordnum+1 (avoids truncating record data).
# Patch 2: iterate over a large fixed range instead of trusting the
# 'First Non-book index' header field.
t = open("mobi/__init__.py",'r').read().replace(
"uncompress_lz77(self.contents[self.records[recordnum]['record Data Offset']:self.records[recordnum+1]['record Data Offset']-self.config['mobi']['extra bytes']])",
"result = uncompress_lz77(self.contents[self.records[recordnum]['record Data Offset']:self.records[recordnum+2]['record Data Offset']-self.config['mobi']['extra bytes']])"
).replace(
"for record in range(1, self.config['mobi']['First Non-book index'] - 1):",
"for record in range(0, 10000000,1):"
)
open("mobi/__init__.py",'w').write(t)
# Silence a very chatty warning in the LZ77 decoder by commenting it out.
t = open("mobi/lz77.py",'r').read().replace(
"print(\"WARNING:","#print(\"WARNING:"
).replace(
"\" beginning of text!",
"#"
)
open("mobi/lz77.py",'w').write(t)

114
tools/to_json.py Executable file
View File

@@ -0,0 +1,114 @@
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
# Split the raw zdic.txt dump into per-shard JSON files under ../zdic_json.
import os
import re
import io
import sys
import json
# Python 2 only: force utf8 as the default str<->unicode codec.
reload(sys)
sys.setdefaultencoding('utf8')
# Each entry in the dump is "<h2>headword</h2>body<mbp:pagebreak/>".
txt = open("../zdic.txt",'r').read()
ent = re.findall(r'<h2>([^<>]*?)</h2>(.*?)<mbp:pagebreak/>', txt)
ent = [(x[0].strip(),x[1]) for x in ent]
# Drop entries with an empty headword.
ent = [x for x in ent if x[0] != ""]
def first(l):
    """Return the first element of l, or the empty string when l is empty."""
    if l:
        return l[0]
    return ""
def rem_bad_char(t):
    # Python 2: decode a byte string to unicode, silently dropping any
    # bytes that are not valid UTF-8.
    return t.decode('utf-8','ignore')
def heads_to_name(t):
    # Build a shard filename from the collected head characters, keeping
    # only BMP characters (code point <= 0xFFFF, i.e. hex digits <= 4)
    # so the name stays well-formed, then decode defensively.
    u = [x for x in t if len(hex(ord(x)))-2 <= 4]
    u = ("".join(u)).decode("utf-8","ignore")
    #print u
    return u
def append_if_ok(heads,word):
    # Tentatively add word's first character to the shard head list, but
    # only after verifying the resulting filename is actually creatable on
    # this filesystem (probe: write then delete a stub file).
    try:
        fn = "../zdic_json/"+heads_to_name(heads+[unicode(word)[0].lower()])+".json"
        open(fn,'wb').write("test.")
        os.remove(fn)
        heads.append(unicode(word)[0].lower())
        return True
    except:
        # Any failure (bad character, OS filename limits, ...) routes the
        # word to the extended.json catch-all instead of a named shard.
        print("BAD WORD NAME:",word)
        return False
# Accumulator state: `heads` holds the head characters of the shard being
# built, `result` maps headword -> entry for that shard, and `weirdo`
# collects entries whose headword cannot form a filename.
heads = []
result = {}
weirdo = {}
for e in ent:
    word = e[0]
    content = e[1]
    print word,
    is_ok = True
    if len(heads) == 0:
        heads = []
        is_ok = append_if_ok(heads,word)
    else:
        try:
            b = not (word.lower()).startswith(heads[-1].lower())
        except:
            continue
        # b: this word starts a new head character.  Flush the current
        # shard to disk once it grows large enough, otherwise just extend
        # the shard with the new head character.
        if b:
            if len(result) > 1000 or len(heads) >= 128:
                fn = heads_to_name(heads)
                open("../zdic_json/"+fn+".json",'wb').write(json.dumps(result))
                result = {}
                heads = []
                is_ok = append_if_ok(heads,word)
            else:
                is_ok = append_if_ok(heads,word)
    # NOTE(review): entries accumulated after the last flush appear never
    # to be written out at end of loop - confirm against the original file.
    # Pull the traditional form, readings and definition list out of the
    # entry's HTML.
    traditional = first(re.findall(r"#444\">\((.*?)\)",content)) + first(re.findall(r"繁体字:(.*?)</li>",content))
    pinyin = first(re.findall(r"拼音.*?(.*?)[<\t ]",content))
    zhuyin = first(re.findall(r"注音.*?(.*?)<",content))
    definition = re.findall(r"<li.*?>(.*?)</li>",content)
    if len(definition) == 0:
        definition = [content]
    definition = [re.sub(r"<.*?>","",d).strip() for d in definition]
    definition = [re.sub(r"^.{0,1}\d.*?[\.\)]","",d).strip() for d in definition]
    definition = [re.sub(r"===汉英互译===","",d).strip() for d in definition]
    #definition = [re.sub(r"【解释】:","",d).strip() for d in definition]
    traditional = re.sub(r"<.*?>","",traditional).replace(" ","")
    # Drop boilerplate lines (encodings, repeated readings, ...).
    # NOTE(review): the '"" not in d' test is always False in Python; a
    # filter character appears to have been lost from this copy of the
    # source - confirm against the original file.
    definition = [rem_bad_char(d) for d in definition
        if (len(pinyin) == 0 or (pinyin not in d and pinyin.replace(" ","") not in d)) \
        and (len(zhuyin) == 0 or zhuyin not in d ) \
        and "繁体" not in d
        and "简体" not in d
        and "郑码" not in d
        and "拼音" not in d
        and "粤语:" not in d
        and "潮州话:" not in d
        and "UniCode" not in d
        and "" not in d
        and len(d.replace(word,"")) > 0
        ]
    # If filtering removed every line, fall back to the whole raw body.
    if len(definition) == 0:
        definition = [rem_bad_char(content)]
        definition = [re.sub(r"<.*?>","",d).strip() for d in definition]
    thing = {"TRD":traditional.decode('utf-8',"ignore"),
        "PRN":[pinyin.decode('utf-8',"ignore"), zhuyin.decode('utf-8',"ignore")],
        "DEF":definition,
        }
    if is_ok:
        result[word]=thing
    else:
        weirdo[word]=thing
# Words that could not form a shard filename go to the catch-all shard.
open("../zdic_json/extended.json",'wb').write(json.dumps(weirdo))

8
tools/to_txt.py Executable file
View File

@@ -0,0 +1,8 @@
#!/usr/bin/env python2.7
# Dump every record of the Kindle dictionary (../zdic.prc) to stdout;
# setup.sh redirects this into ../zdic.txt.
from mobi import Mobi
book = Mobi("../zdic.prc");
book.parse();
# Python 2 print with a trailing comma: records are separated by spaces
# rather than newlines.
for record in book:
    print record,