initial commit
This commit is contained in:
7
.gitignore
vendored
Normal file
7
.gitignore
vendored
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
.DS_Store
|
||||||
|
*/DS_Store
|
||||||
|
zdic.prc
|
||||||
|
zdic.txt
|
||||||
|
zdic_json/*
|
||||||
|
tools/mobi/*
|
||||||
|
zdic-cli-*
|
||||||
88
README.md
Normal file
88
README.md
Normal file
@@ -0,0 +1,88 @@
|
|||||||
|
# zdic-cli
|
||||||
|
|
||||||
|
An alternative, offline, regex-supporting, command-line interface to [zdic (漢典)](https://zdic.net), featuring:
|
||||||
|
|
||||||
|
- No internet connection required
|
||||||
|
- Full text search with regex: a command for finding characters/phrases in body text of definitions.
|
||||||
|
- Colorful text for highlighting entries
|
||||||
|
|
||||||
|
|
||||||
|

|
||||||
|

|
||||||
|

|
||||||
|

|
||||||
|

|
||||||
|
|
||||||
|
|
||||||
|
## Downloads
|
||||||
|
|
||||||
|
Standalone version available for windows, mac and linux. Please refer to the Releases page.
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
zdic-cli runs as a REPL loop, and there are 5 types of commands you can enter:
|
||||||
|
|
||||||
|
```
|
||||||
|
def 甲 display entry for 甲
|
||||||
|
pre 甲 list words that starts with 甲
|
||||||
|
has 甲 乙 ... list words that contains 甲 and 乙 ...
|
||||||
|
txt 甲 list words whose full entry text contains 甲 (regex supported)
|
||||||
|
sel n display entry at previously returned list index n
|
||||||
|
```
|
||||||
|
|
||||||
|
`def` is probably the most common one for simply looking up characters/words. `pre` `has` `txt` will return an enumerated list listing all eligible entries, and `sel 0`/`sel 1`/`sel n` can be used to select from the list.
|
||||||
|
|
||||||
|
|
||||||
|
## Development setup
|
||||||
|
|
||||||
|
**This section is for compiling from source (and is somewhat complex), if you just would like to use the software, please check out the Releases page.**
|
||||||
|
|
||||||
|
### Dependencies
|
||||||
|
|
||||||
|
- node.js/npm
|
||||||
|
- python2. Tested on 2.7. It has to be python 2 instead of 3 because much of the data-processing work were done way back in a time when 2 was the norm. Sorry folks, but if you're a python3 purist feel free to send a PR!
|
||||||
|
- pkg (optional, to build standalone binaries for multiple platforms) `npm install -g pkg`
|
||||||
|
|
||||||
|
### Downloading and compiling the dictionary files
|
||||||
|
|
||||||
|
This repo does not contain the dictionary files themselves as they're too large. Instead, a shell script (`setup.sh`) is provided to automate the process:
|
||||||
|
|
||||||
|
- Automatically download the original Kindle dictionary format (.PRC) from the internet. The download link hardcoded in `setup.sh` might fail in the future, in which case a google search for `汉典.prc` should yield alternative resources.
|
||||||
|
- Automatically download a python library (kroo/mobi-python) for parsing mobi files. It is a rather old library with some oddities, so a find-and-replace script will be automatically run to patch some glitches in the source code :P
|
||||||
|
- `python/to_txt.py` is run to extract a raw `txt` file from the kindle format `prc`.
|
||||||
|
- `python/to_json.py` is run to generate a directory of `json` files from the `txt` to make lookup and formatting more efficient.
|
||||||
|
|
||||||
|
Run the shell script with:
|
||||||
|
|
||||||
|
```
|
||||||
|
sh setup.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
At this point you'll have `zdic.prc`, `zdic.txt` and directory `zdic_json/`. Only `zdic_json` is needed, so if everything went well with the script, you can freely delete the other two and gain some 700MB of free space :)
|
||||||
|
|
||||||
|
### Compiling the binary
|
||||||
|
|
||||||
|
At this point you can also run the software by simply doing:
|
||||||
|
|
||||||
|
```
|
||||||
|
node index.js
|
||||||
|
```
|
||||||
|
|
||||||
|
You can also package it into a binary using
|
||||||
|
|
||||||
|
```
|
||||||
|
pkg .
|
||||||
|
```
|
||||||
|
|
||||||
|
provided you have the node and pkg dependencies installed.
|
||||||
|
|
||||||
|
Tip: to run the software by typing `zdic` anywhere, you can symlink it to `/usr/bin`, e.g.
|
||||||
|
|
||||||
|
```
|
||||||
|
ln -s path/to/zdic-cli/zdic-cli /usr/bin/zdic
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
259
index.js
Normal file
259
index.js
Normal file
@@ -0,0 +1,259 @@
|
|||||||
|
const fs = require("fs")
|
||||||
|
const readline = require('readline');
|
||||||
|
|
||||||
|
const dict_path = __dirname+'/zdic_json'
|
||||||
|
|
||||||
|
// Find the dictionary shard whose filename covers the first character
// of `word`; fall back to the catch-all shard when none matches.
function locate_file(word){
  const head = word.slice(0, 1);
  const names = fs.readdirSync(dict_path);
  for (const name of names) {
    if (name.includes(head)) {
      return name;
    }
  }
  return "extended.json";
}
|
||||||
|
|
||||||
|
// Load and parse one shard file (`file_path` is relative to the
// dictionary folder).
function read_json(file_path){
  const raw = fs.readFileSync(dict_path + "/" + file_path);
  return JSON.parse(raw.toString());
}
|
||||||
|
|
||||||
|
// Look up `word` in its shard.
// Returns [word, entry] when found, or null when absent.
function define(word){
  // Renamed from `dict_path` — the original shadowed the module-level
  // `dict_path` constant with the shard *filename*, which was confusing.
  const shard = locate_file(word);
  const dict = read_json(shard);
  // (Removed a dead `var ret = []` that was never used.)
  if (word in dict){
    return [word, dict[word]];
  }
  return null;
}
|
||||||
|
|
||||||
|
// List every word in `word`'s shard that starts with `word`.
function starts_with(word){
  const ret = [];
  const dict = read_json(locate_file(word));
  // `const w` — the original `for (w in dict)` leaked `w` as an
  // implicit global (a ReferenceError under strict mode).
  for (const w in dict){
    if (w.startsWith(word)){
      ret.push(w);
    }
  }
  return ret;
}
|
||||||
|
|
||||||
|
// List every word (across all shards) that contains each of the query
// terms in `words` as a substring.
function contains(words){
  const matches = [];
  const files = fs.readdirSync(dict_path);
  for (const file of files){
    if (!file.endsWith(".json")){
      continue;
    }
    const dict = read_json(file);
    for (const w in dict){
      if (words.every((term) => w.includes(term))){
        matches.push(w);
      }
    }
  }
  return matches;
}
|
||||||
|
|
||||||
|
// Full-text regex search over the definition bodies of every shard.
// For each matching word, the first matching definition is reported to
// `callback` as [list index, word, context snippet of ~view_len chars];
// the list of matching words is returned (consumed later by `sel n`).
function full_text_search(word,view_len,callback){
  var ret = []
  // half-width of context around the match so the snippet fits view_len
  var pd = Math.floor((view_len-word.length)/2);
  var files = fs.readdirSync(dict_path)
  // NOTE(review): `word` is used verbatim as a regex pattern; an invalid
  // pattern throws here and is caught by the caller's try/catch.
  var re = new RegExp(word);
  for (var i = 0; i < files.length; i++){
    if (!files[i].endsWith(".json")){
      continue;
    }
    var dict = read_json(files[i]);
    for (var w in dict){
      for (var j = 0; j < dict[w]['DEF'].length; j++){
        var idx = dict[w]['DEF'][j].search(re)
        if (idx == -1){
          continue;
        }
        // [running list index, word, snippet clamped at the string start]
        var it = [ret.length,w,dict[w]['DEF'][j].slice(Math.max(idx-pd,0),idx+word.length+pd)];
        ret.push(w);
        callback(it);
        break; // only the first matching definition per word is reported
      }
    }
  }
  return ret;
}
|
||||||
|
|
||||||
|
// Render one dictionary entry as ANSI-colored terminal text:
// headword (green bold), traditional form, pronunciations (yellow),
// then each definition prefixed with 〇, dimmed after the first 。,
// with 《…》 book titles in red and ~ placeholders in yellow.
// Lines are manually wrapped to the terminal width assuming every
// character is 2 columns wide (CJK fullwidth).
function render_def(word,entry){
  var result = ""
  result += "\x1b[32m\033[1m"+word+"\x1b[0m"
  // TRD: traditional-character variant, shown in parentheses when present
  if (entry['TRD'] != ''){
    result += " \x1b[32m\033[1m("+entry['TRD']+")\x1b[0m "
  }
  // PRN: [pinyin, zhuyin]; the bracket pair only opens/closes when
  // pinyin is present
  if (entry['PRN'][0] != ''){
    result += " \x1b[33m[ "+entry['PRN'][0].trim()+""
  }
  if (entry['PRN'][1] != ''){
    result += " , "+entry['PRN'][1].trim()
  }
  if (entry['PRN'][0] != ''){
    result += " ]\x1b[0m"
  }
  result += "\n"
  // usable columns; NOTE(review): process.stdout.columns is undefined
  // when output is piped, which disables wrapping (j >= NaN is false)
  var n = process.stdout.columns-4;

  for (var i = 0; i < entry['DEF'].length; i++){
    result += "\x1b[2m〇\x1b[0m"
    var j = 1;              // current display column (2 per char)
    var t = entry['DEF'][i];
    var isf = true;         // still in the first sentence (undimmed)
    for (var c of t){
      if (c == "《"){
        result += "\x1b[31m" // open red for book titles
      }
      if (c == "~"){
        // ~ stands for the headword: highlight yellow, then restore dim
        result += "\x1b[33m"+c+"\x1b[0m";
        if (!isf){
          result += "\x1b[2m"
        }
      }else{
        result += c;
      }

      if (c == "》"){
        result += "\x1b[0m"  // close red; restore dim if past 1st sentence
        if (!isf){
          result += "\x1b[2m"
        }
      }
      if (c == "。" && isf){
        // dim everything after the first full stop
        result += "\x1b[2m"
        isf = false;
      }
      j+=2;
      if (j >= n){
        // wrap and indent continuation under the 〇 marker
        j = 2;
        result += "\n "

      }
    }
    result+="\x1b[0m\n"
  }
  result += ""
  return result
}
|
||||||
|
|
||||||
|
// Pad (with spaces) or truncate `x` to exactly `n` UTF-16 units.
function char_pad(x,n){
  return String(x).padEnd(n).slice(0, n);
}
|
||||||
|
// One list entry: yellow 4-wide right-aligned index, then the word
// padded/truncated to `n` characters.
function render_item(i,x,n){
  const idx = i.toString().padStart(4);
  return "\x1b[33m" + idx + "\x1b[0m " + char_pad(x, n);
}
|
||||||
|
|
||||||
|
// Lay the result list out as a grid sized to the terminal width.
function render_list(lst){
  const width = 6; // characters shown per word
  const perRow = Math.floor((process.stdout.columns - 2) / (width * 2 + 5));
  let out = "";
  for (let i = 0; i < lst.length; i++){
    // keep CJK characters only for display; fall back to the raw word
    // when stripping leaves nothing
    let shown = lst[i].replace(/[^一-鿿]/g,"");
    if (!shown.length){
      shown = lst[i];
    }
    out += render_item(i, shown, width);
    if (i % perRow == perRow - 1){
      out += "\n";
    }
  }
  return out;
}
|
||||||
|
|
||||||
|
|
||||||
|
// REPL command table: 3-letter command -> [usage string, description].
// The usage/description pair is printed as help when a command is
// entered without an argument.
var commands = {
  "def":["def x ","display entry for x"],
  "pre":["pre x ","list words that starts with x"],
  "has":["has x y ...","list words that contains x and y ..."],
  "txt":["txt x ","list words whose full entry text contains x (regex supported)"],
  "sel":["sel n ","display entry at previously returned list index n"],
}
|
||||||
|
|
||||||
|
// One iteration of the REPL: prompt, parse a command, dispatch, then
// recurse. `prev` is the last non-empty result list, so `sel n` can
// index into it across iterations.
function main(prev){
  var curr = null;   // result list produced by this iteration, if any
  var def = null;    // entry displayed by this iteration, if any
  var fail = false;  // true when the command word was unrecognized
  const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout
  });
  rl.question('> ', (answer) => {
    try{
      answer = answer.trim();
      // commands are exactly 3 letters; char 3 is the separating space
      var cmd = answer.slice(0,3);
      var arg = answer.slice(4);
      if (!arg.length){
        // bare command -> print its usage/help line
        // (unknown bare input throws here and falls into the catch below)
        console.log(`\x1b[36m${commands[cmd][0]} \x1b[0m${commands[cmd][1]}`);
      }else{
        if (cmd == "def"){
          // `var def` re-declares the hoisted outer `def` on purpose:
          // the zero-result check below reads it
          var def = define(arg);
          if (def){
            console.log(render_def(...def));
          }
        }else if (cmd == "pre"){
          curr = starts_with(arg);
          console.log(render_list(curr));
        }else if (cmd == "has"){
          curr = contains(arg.split(" "));
          console.log(render_list(curr));
        }else if (cmd == "txt"){
          // stream matches via callback; snippet sized to terminal width
          curr = full_text_search(arg,Math.floor((process.stdout.columns-20)/2),function(x){
            console.log(`${render_item(x[0],x[1],6)}\x1b[2m${x[2]}\x1b[0m`);
          });
        }else if (cmd == "sel"){
          // index into the previous result list
          var def = define(prev[parseInt(arg)]);
          if (def){
            console.log(render_def(...def))
          }
        }else{
          console.log("\x1b[31munsupported command.\x1b[0m")
          fail = true
        }
        if (!fail && (def == null && (curr == null || curr.length == 0))){
          console.log("\x1b[2m(0 result returned)\x1b[0m")
        }
      }
    }catch(e){
      // anything unparseable: fall back to treating the whole input as a
      // direct `def` lookup
      console.log("\x1b[31mcommand parse failed. trying as direct query...\x1b[0m")
      // console.log(e)
      try{
        var def = define(answer);
        if (def){
          console.log(render_def(...def));
        }else{
          throw new Error();
        }
      }catch(ee){
        console.log("\x1b[31mcommand parse totally failed.\x1b[0m")
      }
    }
    rl.close()
    // loop, carrying forward the freshest result list for `sel`
    main(curr||prev);
  });

}
|
||||||
|
|
||||||
|
// Startup banner and command summary, then enter the REPL loop.
console.log("╔═════════════════════════════════════════════╗")
console.log("║\x1b[31m 漢 典 CLI \x1b[0m║")
console.log("║Unoffical offline 漢典 (zdic.net) commandline║")
console.log("║\x1b[2m w/ data derived from `汉典.prc` (for Kindle)\x1b[0m║")
console.log("║\x1b[2m Lingdong Huang 2020 \x1b[0m║");
console.log("╚═════════════════════════════════════════════╝")
console.log(`commands: ${Object.keys(commands).map(x=>( "\x1b[36m"+x+"\x1b[0m" )).join(",") }, run without arguments to see help`);
// no `prev` on the first iteration; `sel` only works after a list command
main();
|
||||||
7
package.json
Normal file
7
package.json
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
{
|
||||||
|
"name":"zdic-cli",
|
||||||
|
"pkg": {
|
||||||
|
"assets": "zdic_json/*"
|
||||||
|
},
|
||||||
|
"bin":"index.js"
|
||||||
|
}
|
||||||
BIN
screenshots/screen000.png
Normal file
BIN
screenshots/screen000.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 226 KiB |
BIN
screenshots/screen001.png
Normal file
BIN
screenshots/screen001.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 370 KiB |
BIN
screenshots/screen002.png
Normal file
BIN
screenshots/screen002.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 413 KiB |
BIN
screenshots/screen003.png
Normal file
BIN
screenshots/screen003.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 407 KiB |
BIN
screenshots/screen004.png
Normal file
BIN
screenshots/screen004.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 454 KiB |
25
setup.sh
Normal file
25
setup.sh
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
# setup.sh — one-shot setup: download the Kindle dictionary, fetch and
# patch the mobi parser, then convert .prc -> .txt -> per-shard .json.
# Run from the repo root: sh setup.sh
echo "downloading the dictionary..."
# NOTE(review): hardcoded mirror; if it goes stale, search for `汉典.prc`
# for an alternative source (see README).
curl https://blog.xjpvictor.info/wp-content/uploads/汉典.prc > zdic.prc

# the remaining tools all live in tools/
cd tools

echo "downloading dependencies..."
# vendor only the `mobi` package out of kroo/mobi-python
git clone https://github.com/kroo/mobi-python
cp -r mobi-python/mobi ./mobi
rm -rf mobi-python

echo "patching bugs in dependencies..."
# find-and-replace patch of the vendored mobi sources (see patch.py)
chmod +x patch.py
./patch.py

echo "converting dictionary to txt... (this might take a while ~10 mins)"
chmod +x to_txt.py
./to_txt.py > ../zdic.txt

echo "converting txt to json..."
mkdir ../zdic_json
chmod +x to_json.py
./to_json.py

echo "done setting up!"
echo "to use the app, either run 'pkg .' to package into a binary, or run 'node index.js' directly."
|
||||||
16
tools/patch.py
Executable file
16
tools/patch.py
Executable file
@@ -0,0 +1,16 @@
|
|||||||
|
#!/usr/bin/env python2.7
# Patch the vendored kroo/mobi-python in place (literal find-and-replace
# on its source files) so it can parse the zdic .prc dictionary.
#
# NOTE(review): each .replace targets an exact upstream source line; if
# upstream ever changes, the replace becomes a silent no-op.

# mobi/__init__.py:
#  - widen the decompressed record slice (recordnum+1 -> recordnum+2)
#    and capture it into `result`
#  - iterate records over a large fixed range instead of trusting the
#    header's 'First Non-book index' field
t = open("mobi/__init__.py",'r').read().replace(
"uncompress_lz77(self.contents[self.records[recordnum]['record Data Offset']:self.records[recordnum+1]['record Data Offset']-self.config['mobi']['extra bytes']])",
"result = uncompress_lz77(self.contents[self.records[recordnum]['record Data Offset']:self.records[recordnum+2]['record Data Offset']-self.config['mobi']['extra bytes']])"
).replace(
"for record in range(1, self.config['mobi']['First Non-book index'] - 1):",
"for record in range(0, 10000000,1):"
)
open("mobi/__init__.py",'w').write(t)

# mobi/lz77.py: comment out the noisy WARNING print statements
t = open("mobi/lz77.py",'r').read().replace(
"print(\"WARNING:","#print(\"WARNING:"
).replace(
"\" beginning of text!",
"#"
)
open("mobi/lz77.py",'w').write(t)
|
||||||
114
tools/to_json.py
Executable file
114
tools/to_json.py
Executable file
@@ -0,0 +1,114 @@
|
|||||||
|
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
# Convert ../zdic.txt (raw HTML-ish dump of the Kindle dictionary) into
# per-shard JSON files under ../zdic_json/.
import os
import re
import io
import sys
import json

# Python-2-only hack: make implicit str<->unicode conversions use UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')

txt = open("../zdic.txt",'r').read()
# One entry per "<h2>headword</h2> ... <mbp:pagebreak/>" span.
ent = re.findall(r'<h2>([^<>]*?)</h2>(.*?)<mbp:pagebreak/>', txt)
ent = [(x[0].strip(),x[1]) for x in ent]
# drop entries with an empty headword
ent = [x for x in ent if x[0] != ""]
|
||||||
|
|
||||||
|
def first(l):
    """Return the first element of ``l``, or ``""`` when ``l`` is empty."""
    if len(l) == 0:
        return ""
    return l[0]
||||||
|
|
||||||
|
def rem_bad_char(t):
    # Drop bytes that are not valid UTF-8 (Python 2: str -> unicode).
    return t.decode('utf-8','ignore')
||||||
|
|
||||||
|
def heads_to_name(t):
    # Build a shard filename from the collected head characters, keeping
    # only characters whose codepoint fits in 4 hex digits (<= 0xFFFF)
    # so the resulting filename stays sane.
    u = [x for x in t if len(hex(ord(x)))-2 <= 4]
    u = ("".join(u)).decode("utf-8","ignore")
    #print u
    return u
||||||
|
|
||||||
|
def append_if_ok(heads,word):
    # Probe whether the would-be shard filename is actually creatable on
    # this filesystem; only then record word's first char as a new head.
    # Mutates `heads` on success. Returns True/False.
    try:
        fn = "../zdic_json/"+heads_to_name(heads+[unicode(word)[0].lower()])+".json"
        # create-and-delete round trip as a filename validity check
        open(fn,'wb').write("test.")
        os.remove(fn)
        heads.append(unicode(word)[0].lower())
        return True
    except:
        # bad/unwritable name -> this entry goes to extended.json instead
        print("BAD WORD NAME:",word)
        return False
||||||
|
|
||||||
|
# Main conversion loop: walk entries in dictionary order, grouping them
# into shards keyed by their head characters. A shard is flushed to
# ../zdic_json/<heads>.json once it exceeds 1000 entries or 128 heads.
# Entries whose headword cannot form a valid filename go to extended.json.
heads = []    # first characters of the words collected in the current shard
result = {}   # current shard: word -> entry dict
weirdo = {}   # entries with unusable headwords -> extended.json
for e in ent:

    word = e[0]
    content = e[1]
    print word,

    is_ok = True

    if len(heads) == 0:
        # first entry of a fresh shard
        heads = []
        is_ok = append_if_ok(heads,word)

    else:
        try:
            # does this word start a new head group?
            b = not (word.lower()).startswith(heads[-1].lower())
        except:
            continue
        if b:
            if len(result) > 1000 or len(heads) >= 128:
                # shard is full: write it out and start a new one
                fn = heads_to_name(heads)
                open("../zdic_json/"+fn+".json",'wb').write(json.dumps(result))

                result = {}
                heads = []
                is_ok = append_if_ok(heads,word)
            else:
                is_ok = append_if_ok(heads,word)


    # scrape the fields out of the entry's HTML-ish body
    traditional = first(re.findall(r"#444\">\((.*?)\)",content)) + first(re.findall(r"繁体字:(.*?)</li>",content))
    pinyin = first(re.findall(r"拼音.*?:(.*?)[<\t ]",content))
    zhuyin = first(re.findall(r"注音.*?:(.*?)<",content))
    definition = re.findall(r"<li.*?>(.*?)</li>",content)
    if len(definition) == 0:
        definition = [content]

    # strip tags, leading list numbers, and section markers
    definition = [re.sub(r"<.*?>","",d).strip() for d in definition]
    definition = [re.sub(r"^.{0,1}\d.*?[\.\)]","",d).strip() for d in definition]
    definition = [re.sub(r"===汉英互译===","",d).strip() for d in definition]
    #definition = [re.sub(r"【解释】:","",d).strip() for d in definition]
    traditional = re.sub(r"<.*?>","",traditional).replace(" ","")

    # keep only real definition lines: drop metadata lines (pronunciation
    # echoes, encoding tables, dialect readings, etc.)
    definition = [rem_bad_char(d) for d in definition
        if (len(pinyin) == 0 or (pinyin not in d and pinyin.replace(" ","") not in d)) \
        and (len(zhuyin) == 0 or zhuyin not in d ) \
        and "繁体" not in d
        and "简体" not in d
        and "郑码" not in d
        and "拼音" not in d
        and "粤语:" not in d
        and "潮州话:" not in d
        and "UniCode" not in d
        and "◎" not in d
        and len(d.replace(word,"")) > 0
    ]

    if len(definition) == 0:
        # everything got filtered: fall back to the tag-stripped raw body
        definition = [rem_bad_char(content)]
        definition = [re.sub(r"<.*?>","",d).strip() for d in definition]


    # entry schema consumed by index.js: TRD / PRN=[pinyin,zhuyin] / DEF
    thing = {"TRD":traditional.decode('utf-8',"ignore"),
        "PRN":[pinyin.decode('utf-8',"ignore"), zhuyin.decode('utf-8',"ignore")],
        "DEF":definition,
    }
    if is_ok:
        result[word]=thing
    else:
        weirdo[word]=thing

# NOTE(review): the last in-progress shard (`result`) does not appear to
# be flushed here — only `weirdo` is written; confirm whether trailing
# entries are intentionally dropped.
open("../zdic_json/extended.json",'wb').write(json.dumps(weirdo))
|
||||||
|
|
||||||
8
tools/to_txt.py
Executable file
8
tools/to_txt.py
Executable file
@@ -0,0 +1,8 @@
|
|||||||
|
#!/usr/bin/env python2.7
# Dump every record of ../zdic.prc to stdout as text; the caller
# redirects stdout to ../zdic.txt (see setup.sh).
from mobi import Mobi

book = Mobi("../zdic.prc");
book.parse();

# trailing comma: Python 2 prints records space-separated, no newlines
for record in book:
    print record,
|
||||||
Reference in New Issue
Block a user