// Source: bilibili-subtitle repository (TypeScript, 126 lines, 2.8 KiB)
import pinyin from 'tiny-pinyin'
import {uniq} from 'lodash-es'

/**
 * A single token as produced by `pinyin.parse`.
 */
interface Ret {
  /**
   * Token category reported by the parser. Within this file only `2` is
   * inspected: it marks a character that was converted to pinyin.
   * NOTE(review): 1 and 3 presumably mean "Latin" and "unknown" — confirm
   * against the tiny-pinyin documentation.
   */
  type: 1 | 2 | 3
  /** Original text of the token. */
  source: string
  /** Converted text of the token (the pinyin spelling when `type` is 2). */
  target: string
}
/**
 * A run of consecutive tokens that all agree on whether they are pinyin
 * (Chinese) tokens or not.
 */
interface Phase {
  /** True when the tokens in `list` have `type === 2`, false otherwise. */
  pinyin: boolean
  /** Tokens of the run, in source order. */
  list: Ret[]
}
/**
 * Splits a string into a list of phases, separating Chinese text from
 * everything else.
 *
 * The string is tokenized with `pinyin.parse`; adjacent tokens are collected
 * into the same phase for as long as they agree on whether their `type` is 2.
 *
 * @param str - Text to split.
 * @returns The phases in source order; empty when the parser yields no tokens.
 */
export const getPhases = (str: string): Phase[] => {
  const phases: Phase[] = []
  // Phase currently being filled; undefined until the first token is seen.
  let current: Phase | undefined

  for (const ret of pinyin.parse(str)) {
    const isPinyin = ret.type === 2
    // Open a new phase whenever the token kind flips (or on the first token).
    if (current === undefined || current.pinyin !== isPinyin) {
      current = {pinyin: isPinyin, list: []}
      phases.push(current)
    }
    current.list.push(ret)
  }

  return phases
}
/**
 * Breaks a string into atomic search terms, e.g. `tool tab 汉字` becomes
 * `tool`, `tab`, `汉`, `字`.
 *
 * Chinese phases contribute one atom per character; all other phases are
 * re-joined and split into runs of `\w` characters, so whitespace and
 * punctuation are dropped.
 *
 * @param str - Text to split.
 * @returns The atoms in source order (not lower-cased, not de-duplicated).
 */
export const getAtoms = (str: string): string[] =>
  getPhases(str).flatMap((phase): string[] => {
    const sources = phase.list.map(ret => ret.source)
    if (phase.pinyin) {
      // Every non-empty Chinese character is an atom of its own.
      return sources.filter(Boolean)
    }
    // `match` returns null when the phase contains no word characters.
    return sources.join('').match(/\w+/g) ?? []
  })
/**
 * Normalizes a list of terms: lower-cases every entry, then removes
 * duplicates with `uniq`. The input array is not modified.
 *
 * @param atoms - Terms to normalize.
 * @returns A new array of lower-cased, de-duplicated terms.
 */
const fixStrs = (atoms: string[]): string[] =>
  uniq(atoms.map(atom => atom.toLowerCase()))
/**
 * Extracts the normalized word list of a string: the atoms from `getAtoms`,
 * passed through `fixStrs`.
 *
 * @param str - Text to split.
 * @returns The result of applying `fixStrs` to the atoms of `str`.
 */
export const getWords = (str: string): string[] => fixStrs(getAtoms(str))
/**
 * Concatenates the first letter of the pinyin of every Chinese token in
 * `text`; tokens whose `type` is not 2 are skipped.
 */
const toPinyinInitials = (text: string): string =>
  pinyin
    .parse(text)
    .filter((token: Ret) => token.type === 2)
    .map((token: Ret) => token.target[0])
    .filter(Boolean)
    .join('')

/**
 * Builds pinyin search keys for the Chinese parts of a string.
 *
 * 我的世界Minecraft => ['wodeshijie', 'deshijie', 'shijie', 'jie'] + ['wdsj', 'dsj', 'sj', 'j']
 *
 * For every Chinese phase, each suffix of the phase is emitted twice: once as
 * full pinyin and once as pinyin initials. Non-Chinese phases are ignored.
 *
 * @param str - Text to derive keys from.
 * @returns The keys after `fixStrs`, full-pinyin keys of a phase preceding
 *   its initials keys.
 */
export const getWordsPinyin = (str: string): string[] => {
  const result: string[] = []

  for (const phase of getPhases(str)) {
    // Only Chinese phases produce keys.
    if (!phase.pinyin) {
      continue
    }

    // 我的世界 => [我, 的, 世, 界]
    const atoms = phase.list.map(ret => ret.source).filter(Boolean)

    // [我, 的, 世, 界] => [我的世界, 的世界, 世界, 界]
    const suffixes = atoms.map((_, start) => atoms.slice(start).join(''))

    // Full-pinyin keys first, then the initials keys, to keep the output order.
    for (const suffix of suffixes) {
      result.push(pinyin.convertToPinyin(suffix))
    }
    for (const suffix of suffixes) {
      result.push(toPinyinInitials(suffix))
    }
  }

  return fixStrs(result)
}