Files
bilibili-subtitle/src/utils/pinyinUtil.ts
IndieKKY 4b13230dc2 优化
2024-10-06 14:08:58 +08:00

126 lines
2.8 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import pinyin from 'tiny-pinyin'
import {uniq} from 'lodash-es'
/**
* One entry of the result returned by pinyin (see `pinyin.parse` usage in this file).
*/
interface Ret {
// Entry kind. Within this file only the value 2 is special-cased (treated as a pinyin entry).
// NOTE(review): the meanings of 1 and 3 are defined by tiny-pinyin — confirm there.
type: 1 | 2 | 3
// Original text of this entry.
source: string
// Converted text of this entry; for type 2 its first character is used as the pinyin initial.
// NOTE(review): presumably equals the source for non-pinyin entries — confirm against tiny-pinyin.
target: string
}
/**
* A run of consecutive pinyin result entries that are either all pinyin entries
* (type 2) or all non-pinyin entries.
*/
interface Phase {
// true when the entries in `list` are pinyin entries (type 2), false otherwise
pinyin: boolean
// the entries that make up this run, in their original order
list: Ret[]
}
/**
 * Split a string into phases: maximal runs of consecutive parsed entries that
 * are either all pinyin entries (type 2) or all non-pinyin entries.
 *
 * @param str text to parse with `pinyin.parse`
 * @returns the phases in input order; an empty array when nothing was parsed
 */
export const getPhases = (str: string) => {
  const phases: Phase[] = []
  // The phase currently being filled; undefined until the first entry is seen.
  let open: Phase | undefined
  for (const entry of pinyin.parse(str)) {
    const isPinyin = entry.type === 2
    // Start a new phase on the first entry and whenever the kind flips.
    if (open === undefined || open.pinyin !== isPinyin) {
      open = {
        pinyin: isPinyin,
        list: [],
      }
      phases.push(open)
    }
    open.list.push(entry)
  }
  return phases
}
/**
 * Break a string into atoms, e.g. "tool tab 汉字" => tool, tab, 汉, 字.
 *
 * Pinyin phases contribute one atom per entry (its source text); all other
 * phases are joined back into text and contribute every `\w+` run found in it.
 *
 * @param str text to split
 * @returns the atoms in input order (not lower-cased, not de-duplicated)
 */
export const getAtoms = (str: string) => {
  const atoms: string[] = []
  for (const phase of getPhases(str)) {
    if (!phase.pinyin) {
      // Non-pinyin run: rebuild the text and keep only its word-character runs.
      const text = phase.list.map(entry => entry.source).join('')
      const words = text.match(/\w+/g)
      if (words !== null) {
        atoms.push(...words)
      }
      continue
    }
    // Pinyin run: every entry with a non-empty source is its own atom.
    for (const entry of phase.list) {
      if (entry.source) {
        atoms.push(entry.source)
      }
    }
  }
  return atoms
}
/**
 * Normalize a list of strings: lower-case every item, then remove duplicates
 * with `uniq`. The input array is not modified.
 *
 * @param atoms strings to normalize
 * @returns a new array of lower-cased, de-duplicated strings
 */
const fixStrs = (atoms: string[]) => uniq(atoms.map(atom => atom.toLowerCase()))
/**
 * Get the normalized words of a string: its atoms (see getAtoms), lower-cased
 * and de-duplicated (see fixStrs).
 *
 * @param str text to split into words
 * @returns the normalized word list
 */
export const getWords = (str: string) => fixStrs(getAtoms(str))
/**
 * Build pinyin search keys for the pinyin parts of a string.
 *
 * 我的世界Minecraft => ['wodeshijie', 'deshijie', 'shijie', 'jie'] + ['wdsj', 'dsj', 'sj', 'j']
 *
 * Only pinyin phases are handled; every other phase is ignored. For each
 * pinyin phase, every suffix of the phase is emitted twice: once converted
 * with `pinyin.convertToPinyin`, and once as the first characters of the
 * targets of its pinyin entries. The combined list is then normalized with fixStrs.
 *
 * @param str text to derive keys from
 * @returns the normalized list of keys
 */
export const getWordsPinyin = (str: string) => {
  const collected: string[] = []
  for (const phase of getPhases(str)) {
    if (!phase.pinyin) {
      continue
    }
    // 我的世界 => [我, 的, 世, 界]
    const chars = phase.list.map(entry => entry.source).filter(ch => ch)
    // [我, 的, 世, 界] => [我的世界, 的世界, 世界, 界]
    const suffixes = chars.map((_, start) => chars.slice(start).join(''))
    // Full pinyin of every suffix.
    for (const suffix of suffixes) {
      collected.push(pinyin.convertToPinyin(suffix))
    }
    // Pinyin initials of every suffix; entries that are not type 2 contribute nothing.
    for (const suffix of suffixes) {
      const initials = pinyin.parse(suffix)
        .map((e: any) => (e.type === 2 ? e.target[0] : null))
        .filter(e => !!e)
        .join('')
      collected.push(initials)
    }
  }
  return fixStrs(collected)
}