Utilisateur:Yopyop456/Brouillon/bct
// wojmepwam27+
// Global dictionary workspace; all source files are fetched from _path.
window.dict = {}
dict._path = '/tmp2/cnchar2/scripts_and_data/'
// Look up an entry by simplified form; with val, update it instead.
// Falls back to the form without 儿 (erhua) when the exact key is missing.
function get(x, val){
// let idx = this.key2[x] || this.key.indexOf(x)
let idx = this.key.indexOf(x)
if(idx < 0 && x?.includes('儿')){
return get.call(this, x.replace('儿', ''), val)
}
if(val) {
if(typeof val == 'object') Object.assign(this.val[idx], val)
else this.val[idx] = val
}
else return this.val[idx]
}
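// Usage sketch (assumes a merge pass has run, which does dict.get = get.bind(dict)):
//   dict.get('哪儿')               // exact key, else falls back to dict.get('哪')
//   dict.get('好', {pin: 'hǎo'})   // object value: merged into the stored entry
//   dict.get('好', 'replaced')     // non-object value: replaces the entry outright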
// Traditional form of si from CEDICT; falls back to char-by-char lookup.
function get_si_tr(si){
let tr = ''
let entry = dict._cedict[si]
if(entry){
tr = entry[0]
}
else {
si.split('').forEach(a=>tr+=dict._cedict[a]?.at(0)||'')
}
return tr
}
// Up to three short glosses from CC-CEDICT ('e') or CFDICT ('f'); senses with
// capitals or brackets are skipped as likely proper nouns / cross-references.
function get_cc_def(si, lng = 'e'){
let def, i, out
def = dict['_c'+lng+'dict'][si]
if(!def) return ''
else def = def[3].split('/')
def = def.slice(1, -1)
out = []
for(i=0; i<def.length; i++){
if(def[i].includes(';')) {
out.push(def[i])
break
}
if(def[i].match(/[A-Z\[]/)) continue
out.push(def[i])
}
if(!out.length) out.push(def[0])
return out.slice(0, 3).join('; ')
}
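// Example with an illustrative CEDICT line:
//   傳統 传统 [chuan2 tong3] /tradition/traditional/
// load_ccdict stores it under the simplified form, and get_cc_def('传统')
// would return 'tradition; traditional'.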
// Pull the lines for compound si out of the zdic 'more' text of its first character.
function get_zdic_ci(si){
let out, more, i
more = dict._zdic[si[0]].more
more = more.replaceAll(/\s+\n/g, '\n').split('\n')
out = []
i = more.indexOf(si)
if(i<0) return
out.push(more[i], more[i+1])
for(i+=2; i<more.length; i++){
if(!(more[i][0] == '[' || more[i][0] == '〖') && more[i].match(/[a-z]/)) {
out.pop()
break
}
else if(more[i] == ''){
break
}
out.push(more[i])
}
return out
}
// Decomposition info from makemeahanzi: marks the radical ('r') and phonetic ('p')
// in each character's decomposition, then collects phonetic siblings and the
// characters that contain si's characters as components.
function get_decomp(si){
si = si.split('')
let i, tmp, out = {decomp: [], phon: [], up:[]}
si.forEach(char=>{
tmp = dict._decomp[char].decomposition
if(dict._decomp[char].etymology?.phonetic) tmp = tmp.replace(dict._decomp[char].etymology.phonetic, dict._decomp[char].etymology.phonetic + 'p')
if(dict._decomp[char].radical) tmp = tmp.replace(dict._decomp[char].radical, dict._decomp[char].radical + 'r')
if(char == dict._decomp[char].radical) tmp = 'r' + tmp
out.decomp.push(tmp)
})
out.decomp.forEach(char=>{
out.phon.push([])
if(char.includes('p')){
tmp = char.match(/(.)p/)[1]
for(i in dict._decomp){
if(dict._decomp[i].etymology?.phonetic == tmp){
out.phon.at(-1).push(dict._decomp[i].character)
}
}
}
})
si.forEach(char=>{
out.up.push([])
for(i in dict._decomp){
if(dict._decomp[i].decomposition.includes(char)){
out.up.at(-1).push(dict._decomp[i].character)
}
}
})
return out
}
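// Output sketch, using makemeahanzi's fields (members depend on the data snapshot):
//   get_decomp('妈') → {
//     decomp: ['⿰女r马p'],  // radical tagged 'r', phonetic component tagged 'p'
//     phon: [[...]],         // other characters whose phonetic is 马
//     up: [[...]]            // characters whose decomposition contains 妈
//   }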
// Highest HSK character level among the characters of si.
// Note: reads the old-style .le field set by merge_hsk; entries built by
// new_merge_hsk store this in .lev instead.
function get_level(si){
let c, entry, le0, le1, n
le0 = le1 = n = 0
for(c of si){
entry = dict.get(c)
if(!entry) continue
c = entry.le.match(/char-(\d)/)?.at(1)
if(le1 < c){
le0 = le1
le1 = c
n = 1
}
else n++
}
return +le1
}
// Fetch every data source, then merge them into dict.key / dict.val.
async function all_load(){
await load_ccdict('cfdict/cfdict.u8')
await load_ccdict('cedict/cedict_1_0_ts_utf-8_mdbg.txt')
await load_hsk3off()
await load_hsk3elk()
await load_hsk3yar()
await check_hsk3off_elk()
// await check_hsk3off_elk_zi()
await load_hsk2()
await load_hsk2uni()
//await loadhsk3def()
await load_gtrans()
await load_xiandai()
await load_zdic()
await load_decomp()
await load_hydcd()
await load_coloc()
await load_bct()
await load_anki()
await load_wiktio()
// await merge_hsk()
await new_merge_hsk()
// await load_wfreq()
// await merge_hsk2()
// return await mergeHSK()
}
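// Typical console session (a sketch; the files under dict._path must be present):
//   await all_load()   // fetch and merge everything
//   dict.get('你好')   // → entry {sim, lev, tra, pin, num, eng, fra, ...}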
// Parse a CC-CEDICT-format file ('trad simp [pin1 yin1] /def/def/') into a map
// keyed by the simplified form; stored as dict._cedict / dict._cfdict.
async function load_ccdict(s){
let txt, out
out = {}
txt = await fetch(dict._path + s).then(x=>x.text())
txt = txt.replaceAll('\r', '').split('\n')
txt.forEach((a, b)=>{
let w, x, y, z
x = a.indexOf(' ')
y = a.indexOf(' ', x+1)
z = a.indexOf(']', y+1)+1
w = a.slice(x+1, y)
if(!out[w]) out[w] = [a.slice(0, x), a.slice(x+1, y), a.slice(y+1, z), a.slice(z+1)]
else {
out[w][2] += ', ' + a.slice(y+1, z)
out[w][3] += a.slice(z+2)
}
})
dict['_'+s.slice(0, 6)] = out
}
// Official HSK 3.0 word and character CSVs (shawkynasr repo); strips the
// variant/superscript markers so the word column can be used as a key.
async function load_hsk3off(){
let txt
txt = await fetch(dict._path + 'shawkynasr_HSK-official-Query-System/词汇 2022.csv').then(x=>x.text())
txt = txt.trim().replaceAll('\r','').split('\n')
txt.shift()
txt.forEach((a, b)=>{
txt[b]=a.split(',')
txt[b].key = txt[b][2]
txt[b][2] = txt[b][2]
.replace(/[\|\|\∣].+/, '')
.replace(/[\(\(].+[\)\)]/, '')
.replaceAll(/[¹²∙…\|\(\)12\|\(\)\∣、]/g, '')
// .replaceAll(/(.+?)[形名动介副量数连助代叹]+/g, '$1')
txt[b][3] = txt[b][3].replaceAll('∥', '')
})
dict._hsk3off = txt
txt = await fetch(dict._path + 'shawkynasr_HSK-official-Query-System/汉字.csv').then(x=>x.text())
txt = txt.trim().replaceAll('\r','').split('\n')
txt.shift()
txt.forEach((a, b)=>txt[b]=a.split(','))
dict._hsk3off_zi = txt
}
// HSK 3.0 word/char lists from the elkmovie repo; a line numbered '1' marks
// the start of the next level.
async function load_hsk3elk(){
let txt, line, n, m, out, ww, char1, char2
dict._hsk3elk = []
txt = await fetch(dict._path + 'elkmovie_hsk30/wordlist.txt').then(x=>x.text())
txt = txt.split('\n')
m = 0
for(line of txt){
n = line.match(/^\d+/)
if(n == 1) m++
else if(!n) continue
char2 = line.replace(n + ' ', '')
char1 = char2
.replace(/[\|\|\∣].+/, '')
.replace(/[\(\(].+[\)\)]/, '')
.replaceAll(/[¹²∙…\|\(\)12\|\(\)\∣、]/g, '')
// .replaceAll(/(.+?)[形名动介副量数连助代叹]+/g, '$1')
dict._hsk3elk.push({
simplified: char1,
simplified2: char2,
level : 'hsk-' + m
})
}
ww = ''
out = []
txt = await fetch(dict._path + 'elkmovie_hsk30/charlist.txt').then(x=>x.text())
txt = txt.split('\n')
m = 0
for(line of txt){
n = line.match(/^\d+/)
if(n == 1) m++
else if(!n) continue
if(m>7) {
ww += line.slice(-1)
continue
}
out.push({
simplified: line.replace(n + '\t', ''),
pinyin : '',
level: m,
})
}
for(line of out){
if(ww.includes(line.simplified)){
line.level += 'w'
}
}
dict._hsk3elk_zi = out
}
// Sanity-check the official list against the elkmovie list: sort both by level,
// match words, and record each official row's index in the elkmovie order.
async function check_hsk3off_elk(){
dict._hsk3off.sort((a,b)=>{
let lvl = '一二三四五六高'
if(lvl.indexOf(a[1][0]) > lvl.indexOf(b[1][0])){
return 1
}
else if(lvl.indexOf(a[1][0]) < lvl.indexOf(b[1][0])){
return -1
}
else return 0
})
dict._hsk3off_zi.sort((a,b)=>{
let lvl = '一二三四五六高'
if(lvl.indexOf(a[1][0]) > lvl.indexOf(b[1][0])){
return 1
}
else if(lvl.indexOf(a[1][0]) < lvl.indexOf(b[1][0])){
return -1
}
else return 0
})
let key1 = []
let key2 = []
let n = dict._hsk3elk.length
for(let i=0; i<n; i++){
key1.push(dict._hsk3elk[i].simplified)
key2.push(dict._hsk3off[i][2])
}
for(let i=0; i<n; i++){
let j = key1.indexOf(key2[i])
if(j >= 0){
key1[j] = ''
key2[i] = ''
dict._hsk3off[i].push(j) // index 5
}
else{
console.warn(key2[i])
}
}
dict._hsk3off.sort((a,b)=>{
if(a[5] > b[5]){
return 1
}
if(a[5] < b[5]){
return -1
}
else return 0
})
let tmp = [...dict._hsk3elk]
for(let i=0; i<dict._hsk3off.length; i++){
if(dict._hsk3off[i][2].includes('儿')){
if(tmp[i].simplified.includes('儿')){
if(dict._hsk3off[i][3].includes('ér')){
}
else {
tmp[i].simplified = tmp[i].simplified.replace('儿', '')
}
}
else{
console.error(tmp[i])
}
}
}
}
async function check_hsk3off_elk_zi(){
let i, j, keys = ['', '', '', '', '', '', '', ''], keys2 = ['', '', '', '', '', '', '', '']
for(i of dict._hsk3elk_zi) {
keys[i.level] += i.simplified
}
for(i of dict.val) {
if(!i.le.includes('new-')) continue
keys2[i.le[4]] += i.si
}
for(i in keys2) {
keys2[i] = [...(new Set(keys2[i].split('')))].join('')
let re = new RegExp('['+keys2[i]+']', 'g')
for(j=+i+1; j<keys2.length; j++){
keys2[j] = keys2[j].replaceAll(re, '')
}
}
for(i in keys2){
for(j of keys2[i]){
if(!keys[i].includes(j)){
console.log(i, j)
}
}
}
// scrape = keys
// scrape2 = keys2
}
// Old HSK 2.0 word list (hskhsk.com); '--HSK' lines separate the levels.
async function load_hsk2(){
let out = []
let txt = await fetch(dict._path + 'glxxyz_hskhsk.com/hskhsk.txt').then(x=>x.text())
txt = txt.split('\n')
let i, n = 0
for(i=0; i<txt.length; i++){
if(txt[i].includes('--HSK')) {
++n
continue
}
txt[i] = txt[i].split('\t')
let char2 = txt[i][0]
let char1 = char2
.replace(/[\|\|\∣].+/, '')
.replace(/[\(\(].+[\)\)]/, '')
.replaceAll(/[¹²∙…\|\(\)12\|\(\)\∣、]/g, '')
out.push({
level: 'hsk-' + n,
simplified: char1,
simplified2: char2,
definition_en: txt[i][4],
pinyin: txt[i][3],
})
}
dict._hsk2 = out
}
// HSK 3.0 list with English definitions (cultureyard JSON).
async function load_hsk3yar(){
let txt = await fetch(dict._path + 'cultureyard/cultureyard.json').then(x=>x.json())
for(let i in txt){
txt[i].simplified2 = txt[i].simplified
txt[i].simplified = txt[i].simplified
.replace(/[\|\|\∣].+/, '')
.replace(/[\(\(].+[\)\)]/, '')
.replaceAll(/[¹²∙…\|\(\)12\|\(\)\∣、]/g, '')
// .replaceAll(/(.+?)[形名动介副量数连助代叹]+/g, '$1')
// .replace(/(.+)儿$/, '$1')
}
dict._hsk3yar = txt
}
// HSK 2.0 list with French definitions (unige JSON).
async function load_hsk2uni(){
let txt = await fetch(dict._path + 'unige/unige.json').then(x=>x.json())
dict._hsk2uni = txt.value
for(let i in dict._hsk2uni){
dict._hsk2uni[i].simplified2 = dict._hsk2uni[i].simplified
if(dict._hsk2uni[i].simplified.includes('…')){
dict._hsk2uni[i].simplified = dict._hsk2uni[i].simplified.slice(0, 2)
continue
}
dict._hsk2uni[i].simplified = dict._hsk2uni[i].simplified
.replace(/[\|\|\∣].+/, '')
.replace(/[\(\(].+[\)\)]/, '')
.replaceAll(/[¹²∙…\|\(\)12\|\(\)\∣、]/g, '')
// .replaceAll(/(.+?)[形名动介副量数连助代叹]+/g, '$1')
// .replace(/(.+)儿$/, '$1')
}
}
// Google-translate dump, tab-separated: word, French, English.
async function load_gtrans(){
let out, txt = await fetch(dict._path + 'gtranslate/gtranslate.txt').then(x=>x.text())
out = {}
txt = txt.replaceAll('\r', '').split('\n')
for(let i=0; i<txt.length; i++){
txt[i] = txt[i].split('\t')
out[txt[i][0]] = txt[i].slice(1)
}
dict._gtrans = out
}
// 现代汉语词典 7th ed. text dump; lines look like 【word】definition.
async function load_xiandai(){
let i, out, txt
txt = await fetch(dict._path + 'CNMAN_XDHYCD7th/XDHYCD7th.txt').then(x=>x.text())
txt = txt.replaceAll('\r', '').split('\n')
out = {}
for(i=0; i<txt.length; i++){
txt[i] = txt[i].slice(1).split('】')
if(out[txt[i][0]]){
out[txt[i][0]] += '\n+ ' + txt[i][1]
}
else out[txt[i][0]] = '+ ' + txt[i][1]
}
dict._xiandai = out
}
// chinese-xinhua data: ci.json → dict._zdic2 (definitions), word.json → dict._zdic.
async function load_zdic(){
let i, out, txt, ci
txt = await fetch(dict._path + 'pwxcoo_chinese-xinhua/data/ci.json').then(x=>x.json())
out = {}
for(i=0; i<txt.length; i++){
ci = txt[i].ci.replace(/\(.+\)/, '')
if(out[ci]){
// console.warn(ci)
out[ci] += '\n+ ' + txt[i].explanation
}
else out[ci] = '+ ' + txt[i].explanation
}
dict._zdic2 = out
txt = await fetch(dict._path + 'pwxcoo_chinese-xinhua/data/word.json').then(x=>x.json())
out = {}
for(i=0; i<txt.length; i++){
if(out[txt[i].word]){
// console.warn(txt[i].word)
out[txt[i].word].explanation += '\n--\n' + txt[i].pinyin + '\n' + txt[i].explanation
out[txt[i].word].more += '\n--\n' + txt[i].pinyin + '\n' + txt[i].explanation
}
else out[txt[i].word] = txt[i]
}
dict._zdic = out
}
// makemeahanzi dictionary.txt: one JSON object per line, keyed by character.
async function load_decomp(){
let i, out, txt = await fetch(dict._path + 'skishore_makemeahanzi/dictionary.txt').then(x=>x.text())
txt = '[' + txt.trim().replaceAll('\n', ',\n') + ']'
txt = JSON.parse(txt)
out = {}
for(i=0; i<txt.length; i++) {
if(out[txt[i].character]) console.error(out[txt[i].character])
out[txt[i].character] = txt[i]
}
dict._decomp = out
}
// Hanyu Da Cidian word list: one word per line after stripping the extra columns.
async function load_hydcd(){
let i, out, txt
txt = await fetch(dict._path + 'lxs602_Chinese-Mandarin-Dictionaries/Hànyǔ Dà Cídiǎn - dāncí biǎo - word list.tab').then(x=>x.text())
txt = txt.replaceAll(/\t.+$/gm, '').replaceAll(/^.+\|/gm, '').split('\n')
dict._hydcd = txt
}
// Precomputed collocation map (word → frequent collocates).
async function load_coloc(){
let i, out, txt
txt = await fetch(dict._path + 'final/coloc-final.json').then(x=>x.json())
dict._coloc = txt
}
// BCT (Business Chinese Test) word list plus its google-translate pinyin file.
// Returns the indexes of BCT entries not already covered by dict.
async function load_bct(){
if(!dict._bct){
let bct = await fetch(dict._path + 'bct/BCT.txt').then(x=>x.text())
bct = bct.replaceAll('\r','').replaceAll(/"([^"]+?)\n([^"]+?)"/g, '$1 $2').replaceAll(/\t+/g, '\t').replaceAll('🎧', '出')
.trim().split('\n')
let out = []
for(let line of bct){
line = line.split('\t')
if(line[0].match(/\d/)) {
line[2] = line[2]
.replace(/\(.+\)/, '').replace(/<.+>/, '').replace(/[a-zA-Z]+/, '').replace(/[\/…].+/, '').replaceAll(' ','').replace('儿', '')
out.push(line)
}
}
bct = await fetch(dict._path + 'gtranslate/gtranslate_bct.txt').then(x=>x.text())
bct = bct.trim().replaceAll('\r', '').split('\n')
for(let line of bct){
line = line.split('\t')
out[+line[0]][1] = line[2]
// debugger
}
dict._bct = out
}
let out2 = []
let dict_entry, line
if(dict.val){
for(let i in dict._bct){
line = dict._bct[i]
dict_entry = dict.get(line[2])
if(!dict_entry || dict_entry.lev.startsWith('x-BCT')) out2.push(i)
}
}
return out2
}
// TBCL / TOCFL Anki decks (fields separated by a mis-decoded '�' character).
// The first call only caches the raw CSVs; later calls return the words of
// deck z that dict does not already cover.
async function load_anki(z=1){
let i, j, out, out2, txt, entry
if(!dict._anki1){
txt = await fetch(dict._path + 'anki/Taiwan_TBCL_wordlist_Traditional.csv').then(x=>x.text())
txt = txt.split('\r\n')
txt.shift()
dict._anki1 = txt
txt = await fetch(dict._path + 'anki/Taiwan_TOCFL_2023_wordlist_with_audio_Traditional.csv').then(x=>x.text())
txt = txt.split('\r\n')
txt.shift()
dict._anki2 = txt
return
}
out = {tra: [], sim: [], all: []}
out2 = {tra: [], sim: [], all: []}
for(i of dict._anki1){
i = i.split('�')
j = i[2].replace(/\(.+\)/, '').split('/')
entry = null
entry ??= dict.get(j[0])
entry ??= dict.get(j[1])
entry ??= dict.get(j[2])
if(entry && !entry.lev.startsWith('x-TW1')) continue
if(out.sim.includes(j[0])) continue
out.tra.push(i[1])
out.sim.push(j[0])
out.all.push(i)
}
for(i of dict._anki2){
i = i.split('�')
j = i[2].replace(/\(.+\)/, '').split('/')
entry = null
entry ??= dict.get(j[0])
entry ??= dict.get(j[1])
entry ??= dict.get(j[2])
if(entry && !entry.lev.startsWith('x-TW2')) continue
if(out.sim.includes(j[0])) continue
out2.tra.push(i[1])
out2.sim.push(j[0])
out2.all.push(i)
}
if(z == 1) return out
else return out2
}
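// The first call returns undefined (cache pass); subsequent calls return
// {tra, sim, all} for the deck selected by z (1 = TBCL, 2 = TOCFL), e.g.:
//   await load_anki()          // cache pass
//   (await load_anki(1)).sim   // simplified forms not already in dict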
// Frequency-ordered word list; keeps multi-character words unknown to dict.
async function load_wfreq(){
let i, j, out, txt, entry
txt = await fetch(dict._path + 'anki/loach_word_order.json').then(x=>x.json())
out = []
for(i of txt){
if(!dict.get(i) && i.length > 1 && i.charCodeAt(0) < 40000) out.push(i)
}
dict._wfreq = out
}
// Without x: load the cached wiktionary dump. With x: look the word up in the
// cache, or scrape the Wiktionary REST API when online.
async function load_wiktio(x, online=1){
let i, j, out, txt, entry, char, char2
if(!x){
txt = await fetch(dict._path + 'gtranslate/wiktionary.json').then(x=>x.json())
dict._wiktio = txt
return
}
if(typeof scrape != 'object') scrape = {}
char = x
if(char in dict._wiktio){
char2 = dict._wiktio[char]
}
else {
if(!online) return
char2 = await fetch('https://en.wiktionary.org/api/rest_v1/page/definition/'+char).then(x=>x.text())
scrape[char] = char2
}
if(!char2.includes('No definition found') && !char2.includes('html')){
char2 = JSON.parse(char2)
if(char2.zh){
char2 = char2.zh[0].definitions.at(0).definition + (char2.zh[0].definitions.at(1) ? '; ' + char2.zh[0].definitions.at(1).definition : '')
entry = '' + char2.replaceAll(/<.+?>/g, '')
}
}
if(!entry) {
if(!(char in dict._wiktio)){
char2 = await fetch('https://en.wiktionary.org/api/rest_v1/page/mobile-html/'+char).then(x=>x.text())
scrape[char] = char2
}
char2 = char2.match(/\(“.+?”\)/g)
if(char2) entry = '' + char2[0].replaceAll(/<.+?>/g, '').slice(2,-2)
// else console.log(char)
}
return entry
}
/*
11437
out2 = [...(new Set(out))]
11441
for(i=0; i<out2.length; i++) if(!dict.key.includes(out2[i])) console.warn(out2[i])
*/
// (legacy, unused) earlier merge pass; note it reads dict._anki, which the
// current load_anki no longer sets.
async function merge_hsk2(){
let char, char2, char3, i, entry, tmp
// GET ID SIMPLIFIED LEVEL ANKI
tmp = dict._anki[1]
for(i=0; i<tmp.length; i++){
char = tmp[i].replace(/\/.+/, '')
entry = dict.get(char)
if(entry) continue
char2 = get_cc_def(char)
char3 = get_cc_def(char, 'f')
dict.key.push(char)
dict.val.push({
id: dict.val.length,
si: char,
le: 'x-TW',
tr: get_si_tr(char),
pi: '',
po: '',
en: char2 ? '<ce>' + char2 : '',
fr: char3 ? '<cf>' + char3 : '',
})
}
// GET ID SIMPLIFIED LEVEL ANKI
tmp = dict._wfreq
for(i=0; i<tmp.length; i++){
char = tmp[i]
entry = dict.get(char)
if(entry) continue
char2 = get_cc_def(char)
char3 = get_cc_def(char, 'f')
dict.key.push(char)
dict.val.push({
id: dict.val.length,
si: char,
le: 'x-FQ',
tr: get_si_tr(char),
pi: '',
po: '',
en: char2 ? '<ce>' + char2 : '',
fr: char3 ? '<cf>' + char3 : '',
})
}
// ADD MISSING TRANSLATE EN
scrape = []
scrape2 = []
tmp = dict.val
for(i=0; i<tmp.length; i++){
if(!tmp[i].en) {
tmp[i].en = get_cc_def(tmp[i].si)
tmp[i].en = tmp[i].en ? '<ce>' + tmp[i].en : ''
if(tmp[i].en) console.warn(tmp[i].si)
}
if(!tmp[i].fr) {
tmp[i].fr = get_cc_def(tmp[i].si, 'f')
tmp[i].fr = tmp[i].fr ? '<cf>' + tmp[i].fr : ''
if(tmp[i].fr) console.warn(tmp[i].si)
}
if(!tmp[i].en && dict._gtrans[tmp[i].si]) tmp[i].en = '<go>' + dict._gtrans[tmp[i].si][1]
if(!tmp[i].fr && dict._gtrans[tmp[i].si]) tmp[i].fr = '<go>' + dict._gtrans[tmp[i].si][0]
if(!tmp[i].en) {
scrape.push([tmp[i].si, tmp[i].fr])
}
if(!tmp[i].fr) {
scrape2.push([tmp[i].si, tmp[i].en])
}
}
}
// (legacy, superseded by new_merge_hsk) full merge building .le/.si/... entries.
async function merge_hsk(){
dict.key = []
dict.key2 = {}
dict.val = []
dict.get = get.bind(dict)
let char, char2, i, entry, tmp, out
// GET ID SIMPLIFIED LEVEL HSK3
tmp = dict._hsk3elk
for(i=0; i<tmp.length; i++){
char = tmp[i].simplified
char = char
.replace(/[\|\|\∣].+/, '')
.replace(/[\(\(].+[\)\)]/, '')
.replaceAll(/[¹²∙…\|\(\)12\|\(\)\∣、]/g, '')
// .replaceAll(/(.+?)[形名动介副量数连助代叹]+/g, '$1')
// .replace(/(.+)儿$/, '$1')
if(!dict.key.includes(char)){
dict.key.push(char)
dict.val.push({
id: dict.val.length,
si: char,
le: 'new-' + tmp[i].level.slice(-1)
})
}
}
// GET ID SIMPLIFIED LEVEL HSK3 ZI
tmp = dict._hsk3elk_zi
for(i=0; i<tmp.length; i++){
char = tmp[i].simplified
entry = dict.get(char)
if(!entry){
dict.key.push(char)
dict.val.push({
id: dict.val.length,
si: char,
le: 'char-' + tmp[i].level
})
}
else {
entry.le += ' | char-' + tmp[i].level
}
}
// GET ID SIMPLIFIED LEVEL HSK2
for(i=0; i<dict._hsk2.length; i++){
char = dict._hsk2[i].simplified
char = char
.replace(/[\|\|\∣].+/, '')
.replace(/[\(\(].+[\)\)]/, '')
.replaceAll(/[¹²∙…\|\(\)12\|\(\)\∣、]/g, '')
// .replaceAll(/(.+?)[形名动介副量数连助代叹]+/g, '$1')
if(char.includes('儿')){
if(!dict.key.includes(char)){
char = char.replace(/(.+)儿$/, '$1')
}
if(!dict.key.includes(char)){
console.warn(char)
}
}
if(!dict.key.includes(char)){
dict.key.push(char)
dict.val.push({
id: dict.val.length,
si: char,
le: 'old-' + dict._hsk2[i].level.slice(-1)
})
}
else {
if(!dict.get(char).le.includes('old')){
dict.get(char).le += ' | old-' + dict._hsk2[i].level.slice(-1)
}
}
}
// GET ID SIMPLIFIED LEVEL BCT
tmp = await load_bct()
for(i=0; i<tmp.length; i++){
entry = dict._bct[+tmp[i]]
char2 = get_cc_def(entry[2])
dict.key.push(entry[2])
dict.val.push({
id: dict.val.length,
si: entry[2],
le: 'x-BCT',
tr: '',
pi: entry[1],
po: '',
en: char2 ? '<ce>' + char2 : '',
})
}
// GET TRADITIONAL
for(i=0; i<dict.val.length; i++){
dict.val[i].tr = get_si_tr(dict.val[i].si)
}
// GET PINYIN AND PART OF SPEECH HSK3
for(i=0; i<dict._hsk3off.length; i++){
char = dict._hsk3off[i][2]
char = char
.replace(/[\|\|\∣].+/, '')
.replace(/[\(\(].+[\)\)]/, '')
.replaceAll(/[¹²∙…\|\(\)12\|\(\)\∣、]/g, '')
// .replaceAll(/(.+?)[形名动介副量数连助代叹]+/g, '$1')
// .replace(/(.+)儿$/, '$1')
entry = dict.get(char)
if(!entry) entry = dict.get(char.replace('儿', ''))
if(!entry.pi){
entry.pi = dict._hsk3off[i][3]
entry.po = dict._hsk3off[i][4]
}
else {
entry.pi += ' | ' + dict._hsk3off[i][3]
entry.po += ' | ' + dict._hsk3off[i][4]
}
}
// GET PINYIN ENGLISH HSK3 ZI
tmp = dict._hsk3off_zi
for(i=0; i<tmp.length; i++){
entry = dict.get(tmp[i][2])
if(!entry.le.startsWith('char')) continue
if(!entry.pi){
entry.pi = tmp[i][3]
entry.po = ''
entry.en = '<ce>' + get_cc_def(entry.si)
}
else {
entry.pi += ' | ' + tmp[i][3]
}
if(entry.en.includes('undefined')) debugger
}
// CLEAN UP PINYIN
tmp = dict.val
for(i=0; i<tmp.length; i++){
if(!tmp[i].pi) continue
if(tmp[i].pi.match(/\∣[^ ]+/)){
// baba|ba
tmp[i].pi = tmp[i].pi.replace(/\∣([^ ]+)/, '($1)')
}
else if(!tmp[i].pi.includes(' | ') && tmp[i].pi.match(/[^é]r$/)){
// ér hua
tmp[i].pi = tmp[i].pi.slice(0, -1) + ' | ' + tmp[i].pi
}
}
// GET ENGLISH HSK3
tmp = dict._hsk3yar
for(i=0; i<tmp.length; i++){
char = tmp[i].simplified
char = char
.replace(/[\|\|\∣].+/, '')
.replace(/[\(\(].+[\)\)]/, '')
.replaceAll(/[¹²∙…\|\(\)12\|\(\)\∣、]/g, '')
// .replaceAll(/(.+?)[形名动介副量数连助代叹]+/g, '$1')
// .replace(/(.+)儿$/, '$1')
entry = dict.get(char)
if(!entry) entry = dict.get(char.replace('儿', ''))
if(!entry.en){
entry.en = '<yd>' + tmp[i].definition_en
}
else {
entry.en += ' | ' + tmp[i].definition_en
}
}
// GET ENGLISH PINYIN POS HSK2
tmp = dict._hsk2
for(i=0; i<tmp.length; i++){
char = tmp[i].simplified
char = char
.replace(/[\|\|\∣].+/, '')
.replace(/[\(\(].+[\)\)]/, '')
.replaceAll(/[¹²∙…\|\(\)12\|\(\)\∣、]/g, '')
// .replaceAll(/(.+?)[形名动介副量数连助代叹]+/g, '$1')
// .replace(/(.+)儿$/, '$1')
entry = dict.get(char)
if(!entry) entry = dict.get(char.replace('儿', ''))
if(entry.le.includes('new')) continue
if(!entry.en){
entry.pi = tmp[i].pinyin
entry.po = ''
entry.en = '<py>' + tmp[i].definition_en
}
else {
entry.pi += ' | ' + tmp[i].pinyin
entry.po = ''
entry.en += ' | ' + tmp[i].definition_en
}
}
// GET FRENCH HSK2
tmp = dict._hsk2uni
for(i=0; i<tmp.length; i++){
char = tmp[i].simplified
char = char
.replace(/[\|\|\∣].+/, '')
.replace(/[\(\(].+[\)\)]/, '')
.replaceAll(/[¹²∙…\|\(\)12\|\(\)\∣、]/g, '')
// .replaceAll(/(.+?)[形名动介副量数连助代叹]+/g, '$1')
// .replace(/(.+)儿$/, '$1')
if(tmp[i].level.includes('C')) continue
entry = dict.get(char)
if(!entry) entry = dict.get(char.replace('儿', ''))
if(!entry && tmp[i].level.length == 5 && tmp[i].level.startsWith('hsk-')){
char2 = char.slice(2)
char = char.slice(0, 2)
tmp[i].definition_fr = tmp[i].definition_fr + ' (' + char + '……' + char2 + '……)'
entry = dict.get(char2)
if(!entry.fr){
entry.fr = tmp[i].definition_fr
}
else {
entry.fr += ' | ' + tmp[i].definition_fr
}
entry = dict.get(char)
}
if(!entry){
// console.log(tmp[i].simplified, get_cc_def(tmp[i].simplified))
char2 = get_cc_def(tmp[i].simplified)
dict.key.push(tmp[i].simplified)
dict.val.push({
id: dict.val.length,
si: tmp[i].simplified,
le: tmp[i].level.replace('A', '+').replace('B', '+').replace('hsk', 'old'),
tr: get_si_tr(tmp[i].simplified),
pi: tmp[i].pinyin,
po: '',
en: char2 ? '<ce>' + char2 : '',
fr: '<ge>' + tmp[i].definition_fr,
})
}
else if(!entry.fr){
entry.fr = '<ge>' + tmp[i].definition_fr
}
else {
entry.fr += ' | ' + tmp[i].definition_fr
}
}
// ADD EXAMPLE FR
tmp = dict._hsk2uni
for(i=0; i<tmp.length; i++){
if(!tmp[i].example) continue
entry = dict.get(tmp[i].simplified)
if(!entry) entry = dict.get(tmp[i].simplified.replace('儿',''))
entry.fr += '; ' + tmp[i].example
}
// ADD ALL FRENCH
tmp = dict.val
for(i=0; i<tmp.length; i++){
if(tmp[i].fr) continue
char2 = get_cc_def(tmp[i].si, 'f')
tmp[i].fr = char2 ? '<cf>' + char2 : ''
// console.log(tmp[i].si, get_cc_def(tmp[i].si, 'f'))
}
// ADD ZH DEF
out = []
tmp = dict.val
for(i=0; i<tmp.length; i++){
entry = dict._xiandai[tmp[i].si]
//entry ??= get_zdic_ci(tmp[i].si)
//entry ??= dict._zdic2[tmp[i].si]
if(!entry) {
// console.log(tmp[i].si)
tmp[i].zh = ''
out.push(tmp[i].si)
continue
}
tmp[i].zh = entry
}
scrape = out
return
// ADD COLOCATION
entry = []
tmp = dict.val
for(i=0; i<tmp.length; i++){
tmp[i].co = dict._coloc[tmp[i].si]
if(!tmp[i].co) entry.push(tmp[i].si)
}
window.discard = entry
return
// CHECK ENTRIES HYDCD
tmp = dict.key
entry = []
for(i=0; 0 && i<tmp.length; i++){
if(!dict._hydcd.includes(tmp[i]))
entry.push(tmp[i])
}
dict.discard = entry
}
// Current merge: rebuild dict.key/dict.val with the field template z below.
async function new_merge_hsk(){
let key = dict.key = []
let val = dict.val = []
dict.get = get.bind(dict)
let char, char2, i, entry, z, cur, eng
z = {
sim: '', // simplified
lev: '', // level
cha: '', // character compound
col: '', // colocation
dec: '', // decomposition
eng: '', // english
fra: '', // francais
hom: '', // homonym
idx: '', // idx
mor: '', // more example
num: '', // numeric
pin: '', // pinyin
pos: '', // pos
ran: '', // rank
syn: '', // synonym
tra: '', // traditional
wor: '', // word compound
zho: '', // zhongwen
}
// GET HSK V3 CI
cur = dict._hsk3elk
for(i=0; i<cur.length; i++){
char = cur[i].simplified
entry = dict.get(char)
if(!entry){
key.push(char)
z.idx = val.length
z.sim = char
z.lev = 'new-' + cur[i].level.slice(-1)
val.push(structuredClone(z))
}
else {
// console.log(char)
}
}
// GET HSK V3 ZI
cur = dict._hsk3elk_zi
for(i=0; i<cur.length; i++){
char = cur[i].simplified
entry = dict.get(char)
if(!entry){
key.push(char)
z.idx = dict.val.length
z.sim = char
z.lev = 'char-' + cur[i].level
val.push(structuredClone(z))
}
else {
entry.lev += ' | char-' + cur[i].level
}
}
// GET HSK V2 CI
cur = dict._hsk2
for(i=0; i<cur.length; i++){
char = cur[i].simplified
if(char.includes('儿')){
if(!dict.key.includes(char)){
char = char.replace(/(.+)儿$/, '$1')
}
if(!dict.key.includes(char)){
console.warn(char)
}
}
entry = dict.get(char)
if(!entry){
key.push(char)
z.idx = val.length
z.sim = char
z.lev = 'old-' + cur[i].level.slice(-1)
val.push(structuredClone(z))
}
else {
if(!entry.lev.includes('old')){
dict.get(char).lev += ' | old-' + cur[i].level.slice(-1)
}
}
}
// GET BCT
cur = await load_bct()
for(i=0; i<cur.length; i++){
entry = dict._bct[+cur[i]]
if(dict.get(entry[2])) {
// console.log('bct', entry[2])
continue
}
key.push(entry[2])
z.idx = val.length
z.sim = entry[2]
z.lev = 'x-BCT'
val.push(structuredClone(z))
}
// GET HSK2 UNIGE
cur = dict._hsk2uni
for(i=0; i<cur.length; i++){
char = cur[i].simplified
if(dict.get(char)) continue
if(cur[i].level.at(-1) == 'C') continue
key.push(char)
z.idx = val.length
z.sim = char
z.lev = 'x-GE'
val.push(structuredClone(z))
}
// GET TBCL AND TOCFL
cur = await load_anki(1)
for(i=0; i<cur.sim.length; i++){
entry = cur.sim[i]
if(dict.get(entry)) {
console.log(entry)
continue
}
key.push(entry)
z.idx = val.length
z.sim = entry
z.lev = 'x-TW1'
val.push(structuredClone(z))
}
cur = await load_anki(2)
for(i=0; i<cur.sim.length; i++){
entry = cur.sim[i]
key.push(entry)
z.idx = val.length
z.sim = entry
z.lev = 'x-TW2'
val.push(structuredClone(z))
}
// GET TRADITIONAL
cur = dict.val
for(i=0; i<cur.length; i++){
cur[i].tra = get_si_tr(cur[i].sim)
}
// GET DEF ENGLISH HSK V3
cur = dict._hsk3yar
for(i=0; i<cur.length; i++){
char = cur[i].simplified
entry = dict.get(char)
if(!entry.eng){
entry.eng = '<yd>' + cur[i].definition_en
}
else {
entry.eng += ' | ' + cur[i].definition_en
}
}
// GET DEF ENGLISH HSK V2
cur = dict._hsk2
for(i=0; i<cur.length; i++){
char = cur[i].simplified
entry = dict.get(char)
if(entry.lev.includes('new')) continue
if(!entry.eng){
entry.eng = '<py>' + cur[i].definition_en
}
else {
entry.eng += ' | ' + cur[i].definition_en
}
}
// GET ALL ENGLISH
cur = dict.val
for(i=0; i<cur.length; i++){
if(cur[i].eng) continue
char = cur[i].sim
entry = get_cc_def(char)
if(entry) {
cur[i].eng = '<ce>' + entry
}
entry = await load_wiktio(char, 0)
if(entry) {
cur[i].eng = '<wi>' + entry
}
}
// GET FRENCH HSK V2
cur = dict._hsk2uni
for(i=0; i<cur.length; i++){
char = cur[i].simplified
char2 = cur[i].simplified2
if(cur[i].level.includes('C')) continue
if(char2.includes('…')){
dict.get(char).fra = '<ge>' + cur[i].definition_fr + ' (' + char2 + ')'
dict.get(char2.replaceAll('…', '').slice(2, 4)).fra = '<ge>' + cur[i].definition_fr + ' (' + char2 + ')'
continue
}
entry = dict.get(char)
if(!entry.fra) {
entry.fra = '<ge>' + cur[i].definition_fr
}
else {
entry.fra += ' | ' + cur[i].definition_fr
}
}
// GET ALL FRENCH
cur = dict.val
for(i=0; i<cur.length; i++){
if(cur[i].fra) continue
char = cur[i].sim
entry = get_cc_def(char, 'f')
if(entry) {
cur[i].fra = '<cf>' + entry
}
else if(cur[i].eng) {
cur[i].fra = '<en>' + cur[i].eng.replace(/<.+?>/,'')
}
}
// GET PINYIN HSK V3
cur = dict._hsk3off
for(i=0; i<cur.length; i++){
char = cur[i][2]
// if(char == '地方') debugger
entry = dict.get(char)
if(!entry.pin){
entry.pin = cur[i][3]
}
else {
if(entry.pin.includes(cur[i][3])) continue
entry.pin += ' | ' + cur[i][3]
}
}
cur = dict._hsk3off_zi
for(i=0; i<cur.length; i++){
char = cur[i][2]
entry = dict.get(char)
if(!entry.lev.startsWith('char')) continue
if(!entry.pin){
entry.pin = cur[i][3]
}
else {
entry.pin += ' | ' + cur[i][3]
}
}
cur = dict._hsk2uni
for(i=0; i<cur.length; i++){
char = cur[i].simplified
entry = dict.get(char)
if(!( entry && (entry.lev.startsWith('old') || entry.lev.startsWith('x-GE')) )) continue
if(!entry.pin){
entry.pin = cur[i].pinyin
}
else {
entry.pin += ' | ' + cur[i].pinyin
}
}
cur = dict._hsk2
for(i=0; i<cur.length; i++){
char = cur[i].simplified
if(dict.get(char).pin) continue
// console.log('hsk2', char)
dict.get(char).pin = cur[i].pinyin
}
cur = await load_bct()
for(i=0; i<cur.length; i++){
char = dict._bct[+cur[i]][2]
entry = dict.get(char)
if(!( entry && entry.lev.startsWith('x-BCT') )) continue
// if(dict._bct[+cur[i]][3]=='shoddy') debugger
if(!entry.pin){
entry.pin = dict._bct[+cur[i]][1]
}
else {
// entry.pin += ' | ' + dict._bct[+cur[i]][1]
}
}
cur = await load_anki(1)
for(i=0; i<cur.sim.length; i++){
char = cur.sim[i]
entry = dict.get(char)
if(! entry?.lev.startsWith('x-TW1') ) continue
if(!entry.pin){
entry.pin = cur.all[i][3]
}
else {
entry.pin += ' | ' + cur.all[i][3]
}
}
cur = await load_anki(2)
for(i=0; i<cur.sim.length; i++){
char = cur.sim[i]
entry = dict.get(char)
if(! entry?.lev.startsWith('x-TW2') ) continue
if(!entry.pin){
entry.pin = cur.all[i][3]
}
else {
entry.pin += ' | ' + cur.all[i][3]
}
}
// for(let a of dict.val) if(!a.pin)console.log(a)
// for(let a of dict.val) if(a.sim.length > 1 && a.pin.includes('|'))console.log(a)
// GET NUMBERED PINYIN
cur = dict.val
for(i=0; i<cur.length; i++){
entry = cur[i]
entry.num = shapeSpell(entry.pin)
}
}
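// After new_merge_hsk() every dict.val entry follows the z template above;
// shape sketch (values illustrative only):
//   {sim: '传统', lev: 'new-4', tra: '傳統', pin: 'chuántǒng', num: 'chuan2 tong3',
//    eng: '<yd>tradition', fra: '<ge>tradition', idx: 1234, ...}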
/*********/
// Generated by CoffeeScript 1.9.2
/*
PinyinConverter by David Chanin and Jen Liu @quizlet 2013
Inspired by http://stackoverflow.com/questions/1598856/convert-numbered-to-accentuated-pinyin/5607888#5607888
Authors: David Chanin and Jen Liu
Github: chanind
email: dchanin@quizlet.com
*/
/*
function pinyin_addaccents($string) {
# Find words with a number behind them, and replace with callback fn.
return preg_replace_callback(
'~([a-zA-ZüÜ]+)(\d)~',
'pinyin_addaccents_cb',
$string);
}
# Helper callback
function pinyin_addaccents_cb($match) {
static $accentmap = null;
if( $accentmap === null ) {
# Where to place the accent marks
$stars =
'a* e* i* o* u* ü* '.
'A* E* I* O* U* Ü* '.
'a*i a*o e*i ia* ia*o ie* io* iu* '.
'A*I A*O E*I IA* IA*O IE* IO* IU* '.
'o*u ua* ua*i ue* ui* uo* üe* '.
'O*U UA* UA*I UE* UI* UO* ÜE*';
$nostars = str_replace('*', '', $stars);
# Build an array like Array('a' => 'a*') and store statically
$accentmap = array_combine(explode(' ',$nostars), explode(' ', $stars));
unset($stars, $nostars);
}
static $vowels =
Array('a*','e*','i*','o*','u*','ü*','A*','E*','I*','O*','U*','Ü*');
static $pinyin = Array(
1 => Array('ā','ē','ī','ō','ū','ǖ','Ā','Ē','Ī','Ō','Ū','Ǖ'),
2 => Array('á','é','í','ó','ú','ǘ','Á','É','Í','Ó','Ú','Ǘ'),
3 => Array('ǎ','ě','ǐ','ǒ','ǔ','ǚ','Ǎ','Ě','Ǐ','Ǒ','Ǔ','Ǚ'),
4 => Array('à','è','ì','ò','ù','ǜ','À','È','Ì','Ò','Ù','Ǜ'),
5 => Array('a','e','i','o','u','ü','A','E','I','O','U','Ü')
);
list(,$word,$tone) = $match;
# Add star to vowelcluster
$word = strtr($word, $accentmap);
# Replace starred letter with accented
$word = str_replace($vowels, $pinyin[$tone], $word);
return $word;
}
*/
(function() {
var PinyinConverter;
PinyinConverter = {
pinyinRegex: /(shuang|chuang|zhuang|xiang|qiong|shuai|niang|guang|sheng|kuang|shang|jiong|huang|jiang|shuan|xiong|zhang|zheng|zhong|zhuai|zhuan|qiang|chang|liang|chuan|cheng|chong|chuai|hang|peng|chuo|piao|pian|chua|ping|yang|pang|chui|chun|chen|chan|chou|chao|chai|zhun|mang|meng|weng|shai|shei|miao|zhui|mian|yong|ming|wang|zhuo|zhua|shao|yuan|bing|zhen|fang|feng|zhan|zhou|zhao|zhei|zhai|rang|suan|reng|song|seng|dang|deng|dong|xuan|sang|rong|duan|cuan|cong|ceng|cang|diao|ruan|dian|ding|shou|xing|zuan|jiao|zong|zeng|zang|jian|tang|teng|tong|bian|biao|shan|tuan|huan|xian|huai|tiao|tian|hong|xiao|heng|ying|jing|shen|beng|kuan|kuai|nang|neng|nong|juan|kong|nuan|keng|kang|shua|niao|guan|nian|ting|shuo|guai|ning|quan|qiao|shui|gong|geng|gang|qian|bang|lang|leng|long|qing|ling|luan|shun|lian|liao|zhi|lia|liu|qin|lun|lin|luo|lan|lou|qiu|gai|gei|gao|gou|gan|gen|lao|lei|lai|que|gua|guo|nin|gui|niu|nie|gun|qie|qia|jun|kai|kei|kao|kou|kan|ken|qun|nun|nuo|xia|kua|kuo|nen|kui|nan|nou|kun|jue|nao|nei|hai|hei|hao|hou|han|hen|nai|rou|xiu|jin|hua|huo|tie|hui|tun|tui|hun|tuo|tan|jiu|zai|zei|zao|zou|zan|zen|eng|tou|tao|tei|tai|zuo|zui|xin|zun|jie|jia|run|diu|cai|cao|cou|can|cen|die|dia|xue|rui|cuo|cui|dun|cun|cin|ruo|rua|dui|sai|sao|sou|san|sen|duo|den|dan|dou|suo|sui|dao|sun|dei|zha|zhe|dai|xun|ang|ong|wai|fen|fan|fou|fei|zhu|wei|wan|min|miu|mie|wen|men|lie|chi|cha|che|man|mou|mao|mei|mai|yao|you|yan|chu|pin|pie|yin|pen|pan|pou|pao|shi|sha|she|pei|pai|yue|bin|bie|yun|nüe|lve|shu|ben|ban|bao|bei|bai|lüe|nve|ren|ran|rao|xie|re|ri|si|su|se|ru|sa|cu|ce|ca|ji|ci|zi|zu|ze|za|hu|he|ha|ju|ku|ke|qi|ka|gu|ge|ga|li|lu|le|qu|la|ni|xi|nu|ne|na|ti|tu|te|ta|xu|di|du|de|bo|lv|ba|ai|ei|ao|ou|an|en|er|da|wu|wa|wo|fu|fo|fa|nv|mi|mu|yi|ya|ye|me|mo|ma|pi|pu|po|yu|pa|bi|nü|bu|lü|e|o|a)r?[1-5]?/gi,
vowels: {
'a*': '0',
'e*': '1',
'i*': '2',
'o*': '3',
'u*': '4',
'ü*': '5',
'A*': '6',
'E*': '7',
'I*': '8',
'O*': '9',
'U*': '10',
'Ü*': '11'
},
pinyin: {
1: ['ā', 'ē', 'ī', 'ō', 'ū', 'ǖ', 'Ā', 'Ē', 'Ī', 'Ō', 'Ū', 'Ǖ'],
2: ['á', 'é', 'í', 'ó', 'ú', 'ǘ', 'Á', 'É', 'Í', 'Ó', 'Ú', 'Ǘ'],
3: ['ǎ', 'ě', 'ǐ', 'ǒ', 'ǔ', 'ǚ', 'Ǎ', 'Ě', 'Ǐ', 'Ǒ', 'Ǔ', 'Ǚ'],
4: ['à', 'è', 'ì', 'ò', 'ù', 'ǜ', 'À', 'È', 'Ì', 'Ò', 'Ù', 'Ǜ'],
5: ['a', 'e', 'i', 'o', 'u', 'ü', 'A', 'E', 'I', 'O', 'U', 'Ü']
},
convert: function(string) {
var j, len, match, matches, replacement;
matches = string.match(this.pinyinRegex);
if (!matches) {
return string;
}
for (j = 0, len = matches.length; j < len; j++) {
match = matches[j];
replacement = this.getReplacement(match);
string = string.replace(match, replacement);
}
return string;
},
getReplacement: function(match) {
var accentMap, accentedVowelChar, base, replacedWord, tone, vowel, vowelChar, vowelNum, word;
accentMap = this.getAccentMap();
tone = match.slice(-1);
word = match.slice(0, -1).replace('v', 'ü').replace('V', 'Ü');
for (base in accentMap) {
vowel = accentMap[base];
if (word.indexOf(base) >= 0) {
vowelChar = vowel.match(/.\*/)[0];
vowelNum = this.vowels[vowelChar];
accentedVowelChar = this.pinyin[tone.toString()][vowelNum];
replacedWord = word.replace(base, vowel).replace(vowelChar, accentedVowelChar);
return replacedWord;
}
}
return match;
},
getAccentMap: function() {
var base, i, j, len, nostars, ref, stars, starsArray;
if (!this.accentMap) {
stars = 'a*i a*o e*i ia* ia*o ie* io* iu* ' + 'A*I A*O E*I IA* IA*O IE* IO* IU* ' + 'o*u ua* ua*i ue* ui* uo* üe* ' + 'O*U UA* UA*I UE* UI* UO* ÜE* ' + 'A* E* I* O* U* Ü* ' + 'a* e* i* o* u* ü*';
nostars = stars.replace(/\*/g, '');
starsArray = stars.split(' ');
this.accentMap = {};
ref = nostars.split(' ');
for (i = j = 0, len = ref.length; j < len; i = ++j) {
base = ref[i];
this.accentMap[base] = starsArray[i];
}
}
return this.accentMap;
}
};
(function(root, factory) {
if (typeof define === 'function' && define.amd) {
return define(factory);
} else if (typeof exports === 'object') {
return module.exports = factory();
} else {
return root.PinyinConverter = factory();
}
})(this, function() {
return PinyinConverter;
});
}).call(this);
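// Quick check of the converter (numbered → accented):
//   PinyinConverter.convert('ni3 hao3')  // → 'nǐ hǎo'
//   PinyinConverter.convert('lv4')       // → 'lǜ' ('v' stands in for 'ü')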
// Rough inverse of PinyinConverter.convert: accented pinyin → numbered pinyin,
// one syllable at a time; used to fill entry.num.
function shapeSpell(txt){
if(!txt) {
return ''
}
let tmp, i, j, out, re
try{
// tmp = cnchar.shapeSpell(txt, true)
for(i=1; i<=txt.length; i++){
out = txt.substring(0, i)
if(!out.match(/[^a-zA-Z]/)) continue
for(j=1; j<=5; j++){
re = new RegExp('(['+PinyinConverter.pinyin[j].join('')+'])')
if(re.test(out)){
tmp = txt.substring(0, i-1)
if(tmp.match(PinyinConverter.pinyinRegex)){
i = 200 // sentinel: the prefix is already a complete syllable
break
}
out = out.replace(re, x=>{
let pos = PinyinConverter.pinyin[j].indexOf(x)
pos = Object.keys(PinyinConverter.vowels)[pos][0] // plain vowel letter
return pos
})
tmp = out + txt.substring(i) + j // de-accented text + tone digit
i = 100 // sentinel: one accented vowel replaced
break
}
}
}
}
catch(e){
console.warn(txt)
return ''
}
if(i == 201) { // hit the 200 sentinel (the for-loop increment made it 201)
let aa = tmp.match(PinyinConverter.pinyinRegex)
if(aa) return aa[0] + ' ' + shapeSpell(txt.replace(aa[0], ''))
else return txt
}
if(!tmp) return txt
tmp = tmp.replace(/^\W+/, '')
re = PinyinConverter.pinyinRegex.toString().replace('/','').replace('/gi','')
re = new RegExp('^'+re+'$', 'gi')
for(i=1; i<tmp.length; i++){
if(!(tmp.slice(0, i+1)+'1').match(re)) {
if(!(tmp.slice(0, i+2)+'1').match(re)) {
if(!(tmp.slice(0, i+3)+'1').match(re)) {
break
}
}
}
}
if(tmp[i-1].match(/[^AEIOUaeiou0-9]/) && tmp[i].match(/[AEIOUaeiou0-9\u0080-\u0F00]/) && i+1 != tmp.length){
// if(cnchar.dict.spell[tmp.slice(0, i-1)]) i--
if(!(tmp.slice(0, i-1)+'1').match(PinyinConverter.pinyinRegex)) i--
}
j = tmp.slice(0, i).replace(/\d/, '') + tmp.slice(-1)
if(j.length == 2) {
debugger
console.warn(j)
return ''
}
out = j
tmp = tmp.slice(i, -1).trim()
out += (tmp.at(0) == '|' ? ' | ' : ' ') + shapeSpell(tmp)
return out
}
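// Reverse direction (accented → numbered), used above to fill entry.num:
//   shapeSpell('nǐ hǎo')  // → 'ni3 hao3' (possibly with a trailing space)
//   shapeSpell('')        // → ''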
/*********/
// Shared utilities (fetchAll etc.) loaded once onto window._
(async ()=>{
window._ ??= await import('/js/0utils/utils3.mjs')
})()
// Scrape Sketch Engine word sketches (collocations) via the proxy, retrying
// entries whose cached response was not a success.
async function scrape_sketchengine(){
let out, i, list
list = await fetch(dict._path + 'final/char.txt').then(x=>x.text())
list = list.split('\n')
out = {}
for(i=0; i<list.length; i++){
out['' + list[i]] = ''
}
out = {}
out = await fetch(dict._path + 'final/thesaurus.json').then(x=>x.json())
for(i in out) if(!out[i].includes('"status":"success"')) {
out[i] = ''
}
_.fetchAll(out, {max: 8, urlmodif: x=>{debugger
return {
url: 'https://server.chinesezerotohero.com/sketch-engine-proxy.php?https://app.sketchengine.eu/bonito/run.cgi/wsketch?corpname=preloaded/zhtenten17_simplified_stf2&lemma=' + x
}}
})
window.scrape = out
}
// Reduce cached Sketch Engine word sketches to a '、'-joined top-10 collocate
// string per word (sorted by count).
async function process_sketchengine_col(){
let out, i, j, k, list, tmp
list = await fetch(dict._path + 'final/coloc-5000.json').then(x=>x.json())
for(i in list){
list[i] = list[i].replaceAll(/\\u[a-f0-9]{4}/g, (...p)=>{
return JSON.parse('"'+p[0]+'"')
})
list[i] = JSON.parse(list[i])
if(list[i].status != 'success') {
debugger
console.warn(list[i].status)
}
}
out = {}
for(i in list){
if(!list[i].data.Gramrels) continue
tmp = []
for(j of list[i].data.Gramrels){
for(k of j.Words){
tmp.push(k)
}
}
tmp.sort((a,b)=>a.count>b.count?-1:a.count<b.count?1:0)
j = []
for(k of tmp){
if(j.includes(k.cm)) continue
j.push(k.cm)
if(j.length == 10) break
}
out[i] = j.join('、')
}
window.scrape2 = list
window.scrape3 = out
}
// Merge the cached thesaurus responses and extract the similar-word list per entry.
async function process_sketchengine_thes(){
let out, i, j, k, list, list2, tmp
list = await fetch(dict._path + 'final/thesaurus.json').then(x=>x.json())
list2 = await fetch(dict._path + 'final/thesaurus2.json').then(x=>x.json())
tmp = await fetch(dict._path + 'final/thesaurus3.json').then(x=>x.json())
list2.push(...tmp)
for(i in list2){
list2[i] = list2[i].replaceAll(/\\u[a-f0-9]{4}/g, (...p)=>{
return JSON.parse('"'+p[0]+'"')
})
if(list2[i].includes('429 Too Many')) continue
list2[i] = JSON.parse(list2[i])
list[list2[i].request.lemma] = list2[i]
}
tmp = []
for(i in list){
if(typeof list[i] != 'string') continue
list[i] = list[i].replaceAll(/\\u[a-f0-9]{4}/g, (...p)=>{
return JSON.parse('"'+p[0]+'"')
})
list[i] = JSON.parse(list[i])
if(list[i].status != 'success') {
debugger
tmp.push(i)
}
}
out = {}
for(i in list){
if(!list[i].data){
list[i].data = list[i]
}
if(!list[i].data.Words) continue
tmp = []
for(j of list[i].data.Words){
tmp.push(j.word)
}
out[i] = tmp
}
window.scrape2 = list
window.scrape3 = out
window.discard = tmp
}
/*
http://wap.51bc.net/xhy/page/xhy539.html
http://xh.51bc.net/html3/22525.html
http://xh.51bc.net/html4/16.html
{"derivation": "清·赵翼《论诗》诗矮人看戏何曾见,都是随人说短长。”", "example": "无", "explanation": "比喻只知道附和别人,自己没有主见。也比喻见识不广。", "pinyin": "ǎi rén kàn xì", "word": "矮人看戏", "abbreviation": "arkx"}
http://xh.51bc.net/html4/31250.html
{"ci":"", "explanation":""}
http://xh.51bc.net/html5/372021.html
*/
// Parse scraped xinhua ci pages into {ci, explanation} records, appending to out.
async function process_xinhua_ci(x, out){
let txt = await fetch(dict._path + 'final/xinhua-'+x+'.json').then(x=>x.json())
if(typeof out == 'undefined') out = []
for(let i in txt){
txt[i] = txt[i].replaceAll('<br>','')
let kk = txt[i].match(/center>([^<]+)<\/td>/s)?.at(1)
if(!kk) continue
let vv = txt[i].match(/'1'>(.+?)</s).at(1).trim()
// out[i] = {ci: kk, explanation: vv}
out.push({ci: kk, explanation: vv})
}
scrape = txt
scrape2 = out
return out
// aze = []; [0, 50_000, 100_000, 150_000, '150000b', 200_000, 250_000, 300_000, 350_000].forEach(async (x)=>{await process_xinhua_ci(x, aze)})
// aze[0] = []; for(let i of aze) if(i?.ci) aze[0].push(i.ci)
// aze.final = []; [0, 50_000, 100_000, 150_000, 200_000, 250_000, 300_000, 350_000].forEach(async (x)=>aze.final.push(...aze[x]))
}
// Parse scraped chengyu pages into the idiom record shape used by chinese-xinhua.
async function process_xinhua_chengyu(x){
let txt
if(!x) txt = await fetch(dict._path + 'final/chengyu-0.json').then(x=>x.json())
else txt = await fetch(dict._path + x).then(x=>x.json())
let out = []
for(let i in txt){
txt[i] = txt[i].replaceAll('\r', '')
.replaceAll('〗', ':</b>')
.replaceAll('】', ':</b>')
.replaceAll('〖', '<b>')
.replaceAll('【', '<b>')
let word =
txt[i].match(/3><b>([^<]+?)</s)?.at(1)?.trim() ||
txt[i].match(/title1>([^<]+?)</s)?.at(1)?.trim() || ''
let pinyin =
txt[i].match(/拼音.+?'40%'>([^<]+?)</s)?.at(1)?.trim() ||
txt[i].match(/拼音:<\/b>([^<]+?)</s)?.at(1)?.trim() || ''
let explanation =
txt[i].match(/解释.+?'5'>([^<]+?)</s)?.at(1)?.trim() ||
txt[i].match(/解释:<\/b>([^<]+?)</s)?.at(1)?.trim() || ''
let example =
txt[i].match(/例子.+?'5'>([^<]+?)</s)?.at(1)?.trim() ||
txt[i].match(/例子:<\/b>([^<]+?)</s)?.at(1)?.trim() || ''
let abbreviation = ''
let derivation =
txt[i].match(/出处.+?'5'>([^<]+?)</s)?.at(1)?.trim() ||
txt[i].match(/出处:<\/b>([^<]+?)</s)?.at(1)?.trim() || ''
debugger
out.push({
derivation,
example,
explanation,
pinyin,
word,
abbreviation
})
}
scrape = txt
scrape2 = out
}
async function process_xinhua_pageciyu0(x){
let txt = await fetch(dict._path + 'final/words-0.json').then(x=>x.json())
let out = {}
for(let i in txt){
let ciyu = txt[i].match(/ciyu\/(.+\.html)>更多有关(.*?)的词语/s)
if(ciyu) out[ciyu.at(1)] = ciyu.at(2)
//out.push(ciyu)
}
out.links = []
for(let i in txt){
let ciyu
ciyu = txt[i].match(/ciyu\/(.+\.html)>更多有关(.*?)的词语/s)
if(ciyu) continue
ciyu = txt[i].match(/html5\/(z[\w\d]+\.html)/s)
if(ciyu) out.links.push(ciyu.at(1))
//out.push(ciyu)
}
scrape = txt
scrape2 = out
}
async function process_xinhua_pageciyu0A(x){
let txt = await fetch(dict._path + 'final/pageciyu-0A.json').then(x=>x.json())
let out = []
for(let i in txt){
let ciyu = txt[i].match(/z[a-zA-Z0-9].+?\.html/g) || []
//if(ciyu) out[ciyu.at(1)] = ciyu.at(2)
out.push(...ciyu)
}
scrape = txt
scrape2 = out
}
async function process_xinhua_pageciyu0B(x){
// word
let final
final = [];
for(x of [0, 50_000, 100_000, 150_000, '150000b', 200_000, 250_000, 300_000, 350_000]){
await process_xinhua_ci(x, final)
}
console.log(final.length)
let txt = await fetch(dict._path + 'final/pageciyu-0B.json').then(x=>x.json())
let out = await fetch(dict._path + 'final/pageciyu-0C.json').then(x=>x.json())
Object.assign(txt, out)
out = []
for(let i in txt){
let n = i.match(/(\d+)\./)?.at(1)
let ciyu = txt[i].match(/title1>([^<]+?)</s)?.at(1)
let def = txt[i].match(/body4'><br>(.+?)<\/div>/s)?.at(1).replaceAll('<br>','').trim()
//if(ciyu) out[ciyu.at(1)] = ciyu.at(2)
//out[n] = {ci: ciyu, explanation: def}
if(ciyu === undefined || ciyu.length == 1){
console.log(txt[i])
continue
}
out.push({ci: ciyu, explanation: def})
}
out.final = final
final.push(...out)
for(let i in final){
final[i].explanation = final[i].explanation.replaceAll('\r\n', '\n').replaceAll(/<[^>]+>/g, '').replaceAll(/&[#\w\d]+;/g, ' ').trim()
if(final[i].explanation.includes('&')) {debugger; console.log(final[i])}
}
scrape = txt
scrape2 = out
}
async function process_xinhua_pageciyu1(x){
let txt = await fetch(dict._path + 'final/words-0.json').then(x=>x.json())
let out = {}
for(let i in txt){
let ciyu = txt[i].match(/cy\/(.+\.html)>更多相关成语/s)
if(ciyu) {debugger
out[ciyu.at(1)] = i.ci
}
//out.push(ciyu)
}
out.links = []
for(let i in txt){
let ciyu
ciyu = txt[i].match(/cy\/(.+\.html)>更多相关成语/s)
if(ciyu) continue
ciyu = txt[i].match(/html4\/(z[\w\d]+\.html)/s)
if(ciyu) out.links.push(ciyu.at(1))
//out.push(ciyu)
}
scrape = txt
scrape2 = out
}
async function process_xinhua_pageciyu1A(x){
let txt = await fetch(dict._path + 'final/pageciyu-1A.json').then(x=>x.json())
let out = []
for(let i in txt){
let ciyu = txt[i].match(/z[a-zA-Z0-9].+?\.html/g) || []
//if(ciyu) out[ciyu.at(1)] = ciyu.at(2)
out.push(...ciyu)
}
scrape = txt
scrape2 = out
}
async function process_xinhua_pageciyu1B(x){
await process_xinhua_chengyu()
let aze = scrape2
await process_xinhua_chengyu('final/pageciyu-1B.json')
aze.push(...scrape2)
scrape2 = aze
}
// Parse scraped xiehouyu pages into {riddle, answer} pairs.
async function process_xinhua_xhy(){
let txt = await fetch(dict._path + 'final/xhy-0.json').then(x=>x.json())
let out = []
for(let i in txt){
let ciyu = txt[i].matchAll(/([^<>]+)? +—+ +([^<>]+)/g)
ciyu = [...ciyu]
debugger
if(ciyu.length) {
for(let j of ciyu){
out.push({
riddle: j[1],
answer: j[2],
})
}
}
}
scrape = txt
scrape2 = out
}
// Parse scraped per-character pages into word records (pinyin, radical, explanation...).
async function process_xinhua_words(){
let txt = await fetch(dict._path + 'final/words-0.json').then(x=>x.json())
let out = []
for(let i in txt){
let word = txt[i].match(/(.)字的解释-/s)?.at(1) || ''
let oldword = ''
let strokes = txt[i].match(/笔划:[^#]+?>(\d+)</s)?.at(1) || ''
let pinyin = txt[i].match(/拼音:[^#]+?><\/td>([^#]+?)<\/td>/s)?.at(1)
.replaceAll('\t', '').replaceAll(/<[^>]+>/g, '').replaceAll(/ xhziplay\("[^"]+"\);/g, '').trim() || ''
let radicals = txt[i].match(/部首:[^#]+?<\/td>[^#]+?>([^#]+?)<\/td>/s)?.at(1) || ''
let explanation = txt[i].match(/基本解释(.+?)详细解释/s)?.at(1)
.replaceAll('<br>', '\n').replaceAll(/<[^>]+>/g, '').replaceAll(' ', ' ').substring(1).trim() || ''
let more = txt[i].match(/详细解释(.+?table4>)/s)?.at(1)
.replaceAll('<br>', '\n').replaceAll(/<[^>]+>/g, '').substring(1).trim() || ''
if(!word) continue
if(word) debugger
out.push({
word,
oldword,
strokes,
pinyin,
radicals,
explanation,
more
})
}
scrape = txt
scrape2 = out
}
// Scrape ci/xiehouyu pages in 50 000-page batches, decoding gb2312 or utf-8.
async function scrape_xinhua_ci(x=0){
_ = window
let out, i, list
if(typeof scrape != 'undefined') out = scrape
else {
out = {}
for(i=x*50000+1; i<=539 && i<(x+1)*50000+1; i++){
out['' + i] = ''
}
}
_.fetchAll(out, {
max: 8,
responsetype: 'arrayBuffer',
// urlmodif: x=>'http://xh.5156edu.com/html5/'+x+'.html',
urlmodif: x=>'http://wap.51bc.net/xhy/page/xhy'+x+'.html',
valmodif: x=>{
let z = (new TextDecoder("gb2312")).decode(x)
x = z.includes('utf') ? (new TextDecoder("utf-8")).decode(x) : z
// x = x.match(/<table border=.+?<hr.+?<hr/s)?.at(0)
// debugger
return x
},
})
window.scrape = out
// for(i in scrape) if(!scrape[i]){console.log(i); scrape[i] = ''}
// download(JSON.stringify(scrape), 'xinhua-100000.txt')
}
// Extract synonym pairs ('word\tsynonym') from cached Baidu Hanyu pages.
async function load_baidu(x=''){
let out = []
if(window.scrape) out = scrape
let txt = await fetch(dict._path + 'final/baidu-'+x+'.json').then(x=>x.json())
for(let i in txt){
let sub = txt[i].match(/synonym.+antonym/s)?.at(0)
if(!sub) continue
let mm = sub.matchAll(/zici">([^<]+)/g)
let vv
let tt = txt[i].match(/<title>([^_]+)/s)?.at(1)
for(vv=mm.next(); !vv.done; vv=mm.next()){
// skip pairs already recorded in either direction
if(out.includes(tt + '\t' + vv.value[1])) continue
if(out.includes(vv.value[1] + '\t' + tt)) continue
out.push(tt + '\t' + vv.value[1])
}
// break
}
return out
}
// Collect two-character words seen in the Baidu synonym data but missing from char.txt.
async function scrape_baidu(){
let out, i, list, list2
list = await fetch(dict._path + 'final/char.txt').then(x=>x.text())
list = list.split('\n')
out = []
list2 = await fetch(dict._path + 'final/baidu-final.json').then(x=>x.json())
list2 = list2.join('\t').split('\t')
for(i of list2) if(!list.includes(i) && i.length == 2) out.push(i)
console.log(list2.length, out.length)
list2 = await fetch(dict._path + 'Keson96_SynoCN/syno_from_baidu_hanyu.txt').then(x=>x.text())
list2 = list2.replaceAll('\n', '\t').split('\t')
for(i of list2) if(!list.includes(i) && i.length == 2) out.push(i)
console.log(list2.length, out.length)
window.scrape = out
}
// Scraped xinhua ci definitions keyed by word (dict._zdic3).
async function load_zdic3(){
let i, out, txt
txt = await fetch(dict._path + 'final/xinhua-final.json').then(x=>x.json())
out = {}
for(i=0; i<txt.length; i++){
if(txt[i].ci == undefined) continue
if(out[txt[i].ci]){
console.warn(txt[i].ci)
out[txt[i].ci] += '\n+ ' + txt[i].explanation
}
else out[txt[i].ci] = '+ ' + txt[i].explanation
}
dict._zdic3 = out
}
// Compare the scraped xinhua ci data (_zdic3) against chinese-xinhua (_zdic2).
async function merge_zdic(){
let i, out = [];
let out2 = [];
for(i in dict._zdic3) {
if(!(i in dict._zdic2))
out.push(i);
else if(dict._zdic2[i].match(/^\+ /mg).length != dict._zdic3[i].match(/^\+ /mg).length)
out2.push(i)
}
window.scrape = out
window.scrape2 = out2
}
// Compare idiom sources: chinese-xinhua idiom.json vs the scraped chengyu set.
async function merge_zdic_chengyu(){
dict._chengyu = await fetch(dict._path + 'pwxcoo_chinese-xinhua/data/idiom.json').then(x=>x.json())
dict._chengyu2 = await fetch(dict._path + 'final/chengyu-final.json').then(x=>x.json())
dict._chengyu.keys = {}
dict._chengyu2.keys = {}
for(let i in dict._chengyu) {
dict._chengyu.keys[dict._chengyu[i].word] = ''
}
for(let i in dict._chengyu2) {
dict._chengyu2.keys[dict._chengyu2[i].word] = ''
}
let out = [];
for(let i in dict._chengyu.keys) {
if(!(i in dict._chengyu2.keys))
out.push(i);
}
window.scrape = out
}
// Same comparison for xiehouyu (two-part allegorical sayings).
async function merge_zdic_xhy(){
dict._xhy = await fetch(dict._path + 'pwxcoo_chinese-xinhua/data/xiehouyu.json').then(x=>x.json())
dict._xhy2 = await fetch(dict._path + 'final/xhy-final.json').then(x=>x.json())
dict._xhy.keys = {}
dict._xhy2.keys = {}
for(let i in dict._xhy) {
dict._xhy.keys[dict._xhy[i].riddle?.trim()] = ''
}
for(let i in dict._xhy2) {
dict._xhy2.keys[dict._xhy2[i].riddle?.trim()] = ''
}
let out = [];
for(let i in dict._xhy.keys) {
if(!(i in dict._xhy2.keys))
out.push(i);
}
window.scrape = out
}
// Same comparison for single characters (words-final vs chinese-xinhua word.json).
async function merge_zdic_zi(){
if(!dict._zdic) await load_zdic()
dict._zdic_zi = await fetch(dict._path + 'final/words-final.json').then(x=>x.json())
dict._zdic_zi.keys = {}
for(let i in dict._zdic_zi) {
dict._zdic_zi.keys[dict._zdic_zi[i].word?.trim()] = dict._zdic_zi[i]
}
let out = []
for(let i in dict._zdic) {
if(!(i in dict._zdic_zi.keys)) out.push(i)
}
scrape = out
}