распознавание языка в простом JS (en / ger) - необходимы улучшения - PullRequest
0 голосов
/ 20 февраля 2019

Я написал простой код, чтобы определить, написано ли сообщение на английском или на немецком языке. Функция «languageAnalysis()» вызывается из «identifyLanguage()»; последнюю см. в моём ответе ниже.

Код работает, но я ищу:

  • Примеры текстов на английском или немецком языке, на которых код даёт неверный результат
  • улучшения, позволяющие получить лучшие результаты (примерные идеи):
    • частота заглавных букв
    • частота пробелов
    • средняя длина предложений

Не стесняйтесь участвовать:)

// ---------------------------------------------------------------------------
// Reference data used by languageAnalysis().
// All "chance" values are compared against (or summed like) percentages that
// languageAnalysis() derives from the analysed text.
// ---------------------------------------------------------------------------

const LA_ALPHABET = "abcdefghijklmnopqrstuvwxyz";

// Single-letter frequencies (percent), indexed like LA_ALPHABET.
const LA_GERMAN_LETTER_CHANCES = [6.5, 1.9, 3.0, 5.1, 17.4, 1.7, 3.0, 4.8, 7.6, 0.3, 1.2, 3.4, 2.5, 9.8, 2.5, 0.8, 0.02, 7.0, 7.3, 6.2, 4.4, 0.7, 1.9, 0.03, 0.04, 1.1];
const LA_ENGLISH_LETTER_CHANCES = [8.2, 1.5, 2.8, 4.3, 12.7, 2.2, 2.0, 6.1, 7.0, 0.2, 0.8, 4.0, 2.4, 6.7, 7.5, 1.9, 0.1, 6.0, 6.3, 9.1, 2.8, 1.0, 2.4, 0.2, 2.0, 0.1];

// Bigram weights. The entry for the bigram (first, second) lives at index
// 26 * indexOf(first) + indexOf(second); one source row per first letter.
// Each table sums to 10000.
const LA_GERMAN_BIGRAM_CHANCES = [
    /* a */ 8, 31, 27, 11, 64, 15, 30, 20, 5, 1, 7, 59, 28, 102, 0, 4, 0, 51, 53, 46, 75, 2, 3, 0, 1, 2,
    /* b */ 16, 1, 0, 1, 101, 0, 3, 1, 12, 0, 1, 9, 0, 1, 8, 0, 0, 9, 6, 4, 14, 0, 1, 0, 1, 1,
    /* c */ 2, 0, 0, 2, 1, 0, 0, 243, 1, 0, 14, 1, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
    /* d */ 54, 3, 1, 13, 228, 3, 4, 2, 93, 1, 3, 5, 4, 6, 9, 3, 0, 10, 11, 6, 16, 3, 4, 0, 0, 3,
    /* e */ 26, 45, 25, 51, 23, 26, 50, 57, 193, 3, 19, 63, 55, 402, 6, 13, 1, 410, 140, 55, 36, 14, 23, 2, 1, 11,
    /* f */ 19, 2, 0, 9, 25, 12, 3, 1, 7, 0, 1, 5, 1, 2, 9, 1, 0, 18, 4, 20, 24, 1, 1, 0, 0, 1,
    /* g */ 20, 3, 0, 12, 147, 2, 3, 3, 19, 1, 3, 9, 3, 5, 6, 1, 0, 14, 18, 18, 11, 4, 3, 0, 0, 3,
    /* h */ 70, 4, 1, 14, 103, 2, 4, 3, 23, 1, 3, 25, 11, 19, 18, 1, 0, 37, 11, 47, 11, 4, 9, 0, 0, 3,
    /* i */ 7, 7, 76, 20, 163, 5, 38, 12, 1, 1, 12, 25, 27, 168, 20, 2, 0, 17, 79, 78, 3, 5, 1, 0, 0, 5,
    /* j */ 7, 0, 0, 0, 9, 5, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0,
    /* k */ 28, 1, 0, 2, 26, 1, 1, 1, 7, 0, 1, 10, 1, 1, 24, 1, 0, 13, 5, 14, 9, 1, 1, 0, 0, 1,
    /* l */ 45, 7, 2, 14, 65, 5, 6, 2, 61, 1, 7, 42, 3, 4, 14, 2, 0, 2, 22, 27, 13, 3, 2, 0, 0, 3,
    /* m */ 40, 6, 1, 8, 50, 4, 4, 3, 44, 2, 3, 4, 23, 3, 15, 7, 0, 2, 10, 8, 14, 4, 3, 0, 0, 2,
    /* n */ 68, 23, 5, 187, 123, 19, 94, 17, 65, 5, 25, 10, 23, 43, 18, 10, 0, 10, 74, 59, 33, 18, 29, 0, 0, 25,
    /* o */ 3, 8, 15, 7, 25, 6, 5, 9, 1, 1, 3, 31, 17, 64, 1, 6, 0, 50, 19, 9, 3, 3, 7, 0, 1, 6,
    /* p */ 16, 0, 0, 3, 10, 6, 0, 2, 4, 0, 0, 4, 0, 0, 11, 5, 0, 23, 1, 3, 4, 0, 0, 0, 0, 0,
    /* q */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0,
    /* r */ 80, 25, 9, 67, 112, 18, 27, 19, 52, 4, 23, 18, 20, 31, 30, 9, 0, 15, 54, 49, 48, 12, 17, 0, 0, 14,
    /* s */ 36, 10, 89, 20, 99, 7, 13, 9, 65, 2, 11, 9, 12, 7, 28, 22, 0, 8, 76, 116, 15, 9, 10, 0, 2, 7,
    /* t */ 57, 8, 1, 35, 186, 5, 10, 14, 59, 2, 4, 11, 9, 9, 15, 3, 0, 31, 50, 23, 26, 8, 21, 0, 1, 26,
    /* u */ 3, 8, 16, 5, 78, 27, 8, 4, 2, 0, 3, 7, 21, 120, 0, 5, 0, 33, 48, 23, 1, 3, 2, 0, 0, 1,
    /* v */ 3, 0, 0, 0, 37, 0, 0, 0, 9, 0, 0, 0, 0, 0, 43, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* w */ 34, 0, 0, 0, 48, 0, 0, 0, 36, 1, 0, 0, 0, 1, 17, 0, 0, 0, 1, 0, 9, 0, 0, 0, 0, 0,
    /* x */ 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
    /* y */ 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
    /* z */ 4, 1, 0, 1, 28, 0, 1, 0, 11, 0, 1, 2, 1, 0, 2, 0, 0, 0, 1, 7, 43, 1, 9, 0, 0, 1,
];

const LA_ENGLISH_BIGRAM_CHANCES = [
    /* a */ 1, 32, 39, 15, 0, 10, 18, 0, 16, 0, 10, 77, 18, 177, 2, 31, 1, 106, 67, 124, 12, 24, 7, 0, 27, 1,
    /* b */ 8, 0, 0, 0, 58, 0, 0, 0, 6, 2, 0, 21, 1, 0, 11, 0, 0, 6, 5, 0, 25, 0, 0, 0, 19, 0,
    /* c */ 44, 0, 12, 0, 55, 1, 0, 46, 15, 0, 8, 16, 0, 0, 59, 1, 0, 7, 1, 38, 16, 0, 1, 0, 0, 0,
    /* d */ 45, 18, 4, 10, 39, 12, 2, 3, 57, 1, 0, 7, 9, 5, 37, 7, 1, 10, 32, 39, 8, 4, 9, 0, 6, 0,
    /* e */ 65, 11, 64, 107, 39, 23, 20, 15, 40, 1, 2, 46, 43, 125, 46, 32, 14, 154, 145, 80, 7, 16, 41, 17, 17, 0,
    /* f */ 21, 2, 9, 1, 25, 14, 1, 6, 26, 1, 0, 10, 3, 2, 38, 3, 0, 4, 8, 42, 11, 1, 4, 0, 1, 0,
    /* g */ 11, 2, 1, 1, 32, 3, 1, 16, 10, 0, 0, 4, 1, 3, 23, 1, 0, 21, 7, 18, 8, 0, 2, 0, 1, 0,
    /* h */ 84, 1, 2, 1, 251, 2, 0, 5, 72, 0, 0, 3, 1, 2, 46, 1, 0, 8, 3, 22, 2, 0, 7, 0, 1, 0,
    /* i */ 18, 7, 55, 16, 37, 27, 10, 0, 0, 0, 8, 39, 32, 169, 63, 3, 0, 21, 106, 88, 0, 14, 1, 1, 0, 4,
    /* j */ 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0,
    /* k */ 0, 0, 0, 0, 28, 0, 0, 0, 8, 0, 0, 0, 0, 3, 3, 0, 0, 0, 2, 1, 0, 0, 3, 0, 3, 0,
    /* l */ 34, 7, 8, 28, 72, 5, 1, 0, 57, 1, 3, 60, 4, 1, 28, 2, 2, 2, 12, 19, 8, 2, 5, 0, 47, 0,
    /* m */ 56, 9, 1, 2, 48, 0, 0, 1, 26, 0, 0, 0, 5, 3, 28, 16, 0, 0, 6, 6, 13, 0, 2, 0, 3, 0,
    /* n */ 54, 7, 31, 118, 64, 8, 75, 9, 37, 3, 3, 10, 7, 9, 65, 7, 0, 5, 51, 110, 12, 4, 15, 1, 14, 0,
    /* o */ 9, 18, 18, 16, 3, 94, 3, 3, 13, 0, 5, 17, 44, 145, 23, 29, 0, 118, 37, 53, 96, 13, 36, 0, 4, 2,
    /* p */ 21, 1, 0, 0, 40, 0, 0, 7, 8, 0, 0, 29, 0, 0, 28, 26, 42, 3, 14, 7, 0, 1, 0, 2, 0, 0,
    /* q */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 20, 0, 0, 0, 0, 0,
    /* r */ 57, 4, 14, 16, 148, 6, 6, 3, 77, 1, 11, 12, 15, 17, 54, 8, 0, 18, 39, 63, 6, 5, 10, 0, 17, 0,
    /* s */ 75, 13, 21, 6, 84, 13, 6, 30, 42, 0, 2, 6, 14, 19, 71, 24, 2, 6, 41, 121, 30, 2, 27, 0, 4, 0,
    /* t */ 56, 14, 6, 9, 94, 5, 1, 325, 135, 0, 0, 12, 14, 8, 121, 8, 0, 30, 32, 53, 22, 4, 16, 0, 21, 0,
    /* u */ 18, 5, 17, 11, 11, 1, 12, 2, 5, 0, 0, 28, 9, 33, 2, 17, 0, 49, 42, 45, 0, 0, 0, 1, 1, 1,
    /* v */ 15, 0, 0, 0, 53, 0, 0, 0, 19, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* w */ 32, 0, 3, 4, 30, 1, 0, 48, 37, 0, 0, 4, 1, 10, 17, 2, 0, 1, 3, 6, 1, 1, 2, 0, 0, 0,
    /* x */ 3, 0, 5, 0, 1, 0, 0, 0, 4, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
    /* y */ 11, 11, 10, 4, 12, 3, 5, 5, 18, 0, 0, 6, 4, 3, 28, 7, 0, 5, 17, 21, 1, 3, 14, 0, 0, 0,
    /* z */ 0, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
];

// Trigrams that occur in both languages, with per-language frequencies (percent).
const LA_SHARED_TRIGRAMS = ["ich", "nde", "und", "der", "ing", "and", "ent"];
const LA_GERMAN_SHARED_TRIGRAM_CHANCES = [1.11, 0.89, 0.87, 0.86, 0.21, 0.25, 0.26];
const LA_ENGLISH_SHARED_TRIGRAM_CHANCES = [0.23, 0.23, 0.33, 0.28, 1.11, 1.02, 0.73];

// Characters treated as a definite sign of German: ä, ö, ü, ß.
const LA_GERMAN_ONLY_LETTERS = ["\u00e4", "\u00f6", "\u00fc", "\u00df"];

// Word-initial letter frequencies (percent) for a few discriminating letters.
const LA_FIRST_LETTERS = ["a", "c", "d", "e", "o", "t"];
const LA_GERMAN_FIRST_LETTER_CHANCES = [6, 0.1, 14.25, 7.9, 1, 1.5];
const LA_ENGLISH_FIRST_LETTER_CHANCES = [11.7, 5.24, 3.17, 2.8, 7.63, 15.98];

const LA_GERMAN_FREQUENT_WORDS = ["der", "die", "und", "millionen", "den", "von", "zu", "das", "mit", "sich", "des", "auf", "für", "ist", "im", "dem", "nicht", "ein", "seiner", "eine", "als", "auch", "es", "worden", "werden", "aus", "er", "hat", "dass", "was", "nach", "wird", "bei", "einer", "will", "um", "am", "sind", "noch", "wie", "einem", "über", "einen", "zwischen", "so", "sie", "zum", "war", "haben", "nur", "oder", "aber", "vor", "zur", "bis", "mehr", "durch", "man", "sein", "wurde", "sei", "sagte", "prozent", "hatte", "kann", "gegen", "vom", "können", "schon", "wenn", "habe", "seine", "mark", "ihre", "dann", "unter", "wir", "soll", "ich", "eines", "immer", "jahr", "zwei", "jahren", "diese", "dieser", "wieder", "keine", "uhr", "tag", "du", "mein", "gut", "heute"];
const LA_ENGLISH_FREQUENT_WORDS = ["the", "of", "and", "a", "to", "is", "you", "that", "it", "he", "was", "for", "on", "are", "as", "with", "his", "they", "i", "at", "be", "this", "have", "from", "or", "one", "had", "by", "word", "but", "not", "what", "all", "were", "we", "when", "your", "can", "said", "there", "use", "each", "which", "she", "do", "how", "their", "if", "will", "up", "other", "about", "out", "many", "then", "them", "these", "so", "some", "her", "would", "make", "like", "him", "into", "time", "has", "look", "two", "more", "write", "go", "see", "may", "no", "way", "could", "made", "my", "than", "first", "come", "been", "call", "who", "oil", "its", "now", "find", "long", "down", "day", "did", "get"];

// word -> "german" | "english". A word present in both lists is credited to the
// list in which it appears at the lower index (German wins an exact index tie),
// which is the order in which the lists were originally scanned.
const LA_FREQUENT_WORD_LANGUAGE = (() => {
    const lookup = new Map();
    const longest = Math.max(LA_GERMAN_FREQUENT_WORDS.length, LA_ENGLISH_FREQUENT_WORDS.length);
    for (let rank = 0; rank < longest; rank++) {
        const germanWord = LA_GERMAN_FREQUENT_WORDS[rank];
        const englishWord = LA_ENGLISH_FREQUENT_WORDS[rank];
        if (germanWord !== undefined && !lookup.has(germanWord)) {
            lookup.set(germanWord, "german");
        }
        if (englishWord !== undefined && !lookup.has(englishWord)) {
            lookup.set(englishWord, "english");
        }
    }
    return lookup;
})();

// Language-specific trigrams as [trigram, weight] pairs (each list sums to 10.35).
const LA_GERMAN_TRIGRAMS = [["ein", 1.22], ["die", 0.87], ["che", 0.75], ["end", 0.75], ["gen", 0.71], ["sch", 0.66], ["cht", 0.61], ["den", 0.57], ["nge", 0.52], ["nun", 0.48], ["ung", 0.48], ["das", 0.47], ["hen", 0.47], ["ind", 0.46], ["enw", 0.45], ["ens", 0.44], ["ies", 0.44]];
const LA_ENGLISH_TRIGRAMS = [["the", 3.54], ["ion", 0.76], ["tio", 0.76], ["her", 0.69], ["ate", 0.67], ["tha", 0.63], ["ati", 0.61], ["for", 0.61], ["hat", 0.56], ["his", 0.53], ["res", 0.51], ["ill", 0.48]];

// Characters that separate words: whitespace, double/single quotes and . , : ; ? !
const LA_WORD_SEPARATORS = /[\s"?!.,:;']+/;

/** Counts the non-overlapping occurrences of `needle` in `haystack`. */
function laCountOccurrences(haystack, needle) {
    return haystack.split(needle).length - 1;
}

/**
 * Decides which language's reference value an observed frequency is closer to
 * and how much weight that decision carries.
 * Outside the [low, high] band the weight is the gap between the two reference
 * values; inside the band it is the distance to the farther reference value.
 * Equal reference values count as "english" with weight 0.
 */
function laVoteForClosest(frequency, germanChance, englishChance) {
    const high = Math.max(germanChance, englishChance);
    const low = Math.min(germanChance, englishChance);
    let closerToHigh;
    let weight;
    if (frequency > high) {
        closerToHigh = true;
        weight = high - low;
    } else if (frequency < low) {
        closerToHigh = false;
        weight = high - low;
    } else if (frequency - low > high - frequency) {
        closerToHigh = true;
        weight = frequency - low;
    } else {
        closerToHigh = false;
        weight = high - frequency;
    }
    const reference = closerToHigh ? high : low;
    return { language: reference === englishChance ? "english" : "german", weight };
}

/** Builds an indicator for whichever score is larger, or null on a tie. */
function laMajorityIndicator(type, germanScore, englishScore) {
    if (germanScore > englishScore) {
        return { type, language: "german", reliability: (germanScore / (germanScore + englishScore)) * 100 };
    }
    if (englishScore > germanScore) {
        return { type, language: "english", reliability: (englishScore / (germanScore + englishScore)) * 100 };
    }
    return null;
}

/**
 * Scores every token that actually occurs (frequency > 0) against the two
 * reference profiles and returns the resulting indicator (or null on a tie).
 */
function laProfileIndicator(type, tokens, germanChances, englishChances, frequencyOf) {
    let germanScore = 0;
    let englishScore = 0;
    tokens.forEach((token, index) => {
        const frequency = frequencyOf(token);
        if (!(frequency > 0)) {
            return; // token absent (also covers NaN for empty input)
        }
        const vote = laVoteForClosest(frequency, germanChances[index], englishChances[index]);
        if (vote.language === "english") {
            englishScore += vote.weight;
        } else {
            germanScore += vote.weight;
        }
    });
    return laMajorityIndicator(type, germanScore, englishScore);
}

/** Sums the weights of all [ngram, weight] pairs whose ngram occurs in `text`. */
function laSumPresentWeights(text, weightedNgrams) {
    let sum = 0;
    for (const [ngram, weight] of weightedNgrams) {
        if (text.includes(ngram)) {
            sum += weight;
        }
    }
    return sum;
}

/**
 * Guesses whether `text` is German or English by combining several heuristics
 * (letter, bigram and trigram statistics, German-only characters, word-initial
 * letters and frequent words). Each heuristic that reaches a verdict contributes
 * an indicator with a reliability in percent; the indicators are then averaged.
 *
 * A one-line summary of all indicators is written to console.log.
 *
 * Changes compared with the previous implementation:
 *  - no implicit globals (the function now works in strict mode / ES modules);
 *  - the bigram tables no longer contain string literals broken across lines;
 *  - words are split on separator runs, so no empty tokens are produced, and
 *    the last word is no longer dropped from the first-letter statistic when
 *    the text does not end with punctuation.
 *
 * @param {string} text - Text to analyse; null/undefined is treated as "".
 * @returns {string} "german<percent>", "english<percent>" (language name
 *     directly followed by the numeric confidence) or "undefined" when no
 *     verdict could be reached.
 */
function languageAnalysis(text) {
    const lowered = (text == null ? "" : String(text)).toLowerCase();
    const indicators = [];
    const addIndicator = (indicator) => {
        if (indicator !== null) {
            indicators.push(indicator);
        }
    };

    // BLOCK 1: single character frequency
    addIndicator(laProfileIndicator(
        "Character-Frequency",
        [...LA_ALPHABET],
        LA_GERMAN_LETTER_CHANCES,
        LA_ENGLISH_LETTER_CHANCES,
        (letter) => (laCountOccurrences(lowered, letter) / lowered.length) * 100
    ));

    // BLOCK 2: bigram frequency - sum the weights of every bigram that occurs at least once
    let germanBigramSum = 0;
    let englishBigramSum = 0;
    for (let first = 0; first < LA_ALPHABET.length; first++) {
        for (let second = 0; second < LA_ALPHABET.length; second++) {
            if (lowered.includes(LA_ALPHABET[first] + LA_ALPHABET[second])) {
                const index = first * LA_ALPHABET.length + second;
                germanBigramSum += LA_GERMAN_BIGRAM_CHANCES[index];
                englishBigramSum += LA_ENGLISH_BIGRAM_CHANCES[index];
            }
        }
    }
    const bigramIndicator = laMajorityIndicator("Bigramm-Frequency", germanBigramSum, englishBigramSum);
    if (bigramIndicator !== null) {
        indicators.push(bigramIndicator);
        // A winner with more than twice the loser's weight counts as an extra, certain vote.
        const winnerSum = Math.max(germanBigramSum, englishBigramSum);
        const loserSum = Math.min(germanBigramSum, englishBigramSum);
        if (winnerSum > 2 * loserSum) {
            indicators.push({ type: "Bigramm-Frequency-Obvious", language: bigramIndicator.language, reliability: 100 });
        }
    }

    // BLOCK 3: frequency of trigrams shared by both languages (each hit covers 3 characters)
    addIndicator(laProfileIndicator(
        "Trigramm-Frequency",
        LA_SHARED_TRIGRAMS,
        LA_GERMAN_SHARED_TRIGRAM_CHANCES,
        LA_ENGLISH_SHARED_TRIGRAM_CHANCES,
        (trigram) => ((laCountOccurrences(lowered, trigram) * 3) / lowered.length) * 100
    ));

    // BLOCK 4: German-only characters
    if (LA_GERMAN_ONLY_LETTERS.some((letter) => lowered.includes(letter))) {
        indicators.push({ type: "German-Characters", language: "german", reliability: 100 });
    }

    // BLOCK 5: first letter frequency
    const words = lowered.split(LA_WORD_SEPARATORS).filter((word) => word !== "");
    const firstLetterString = words.map((word) => word.charAt(0)).join("");
    addIndicator(laProfileIndicator(
        "First-Letters",
        LA_FIRST_LETTERS,
        LA_GERMAN_FIRST_LETTER_CHANCES,
        LA_ENGLISH_FIRST_LETTER_CHANCES,
        (letter) => (laCountOccurrences(firstLetterString, letter) / firstLetterString.length) * 100
    ));

    // BLOCK 6: frequent words
    let germanWordCount = 0;
    let englishWordCount = 0;
    for (const word of words) {
        const wordLanguage = LA_FREQUENT_WORD_LANGUAGE.get(word);
        if (wordLanguage === "german") {
            germanWordCount++;
        } else if (wordLanguage === "english") {
            englishWordCount++;
        }
    }
    const wordIndicator = laMajorityIndicator("Frequent-Words", germanWordCount, englishWordCount);
    if (wordIndicator !== null) {
        indicators.push(wordIndicator);
        // More than four hits for one language and none for the other is an extra, certain vote.
        const winnerCount = Math.max(germanWordCount, englishWordCount);
        const loserCount = Math.min(germanWordCount, englishWordCount);
        if (winnerCount > 4 && loserCount === 0) {
            indicators.push({ type: "Frequent-Words-Obvious", language: wordIndicator.language, reliability: 100 });
        }
    }

    // BLOCK 7: trigram comparison (language-specific collections)
    addIndicator(laMajorityIndicator(
        "Trigram-Comparison",
        laSumPresentWeights(lowered, LA_GERMAN_TRIGRAMS),
        laSumPresentWeights(lowered, LA_ENGLISH_TRIGRAMS)
    ));

    // FINAL MEASURE: every indicator gives `reliability` points to its own
    // language and the remaining (100 - reliability) points to the other one.
    let result = "undefined";
    if (indicators.length > 0) {
        let germanTotal = 0;
        let englishTotal = 0;
        for (const { language, reliability } of indicators) {
            if (language === "german") {
                germanTotal += reliability;
                englishTotal += 100 - reliability;
            } else {
                englishTotal += reliability;
                germanTotal += 100 - reliability;
            }
        }
        const pointTotal = germanTotal + englishTotal;
        const germanShare = (germanTotal / pointTotal) * 100;
        const englishShare = (englishTotal / pointTotal) * 100;
        if (germanShare > englishShare) {
            result = "german" + germanShare;
        } else if (englishShare > germanShare) {
            result = "english" + englishShare;
        }
    }

    let results = "ANALYSIS-RESULTS (" + lowered + "):  ";
    for (const { type, language, reliability } of indicators) {
        results += type + ":" + language + "(" + reliability + "%); ";
    }
    console.log(results);

    return result;
}

1 Ответ

0 голосов
/ 20 февраля 2019

Я использую функцию identifyLanguage() для вызова languageAnalysis(). В зависимости от результата для всего сообщения анализ может быть выполнен повторно, на этот раз для каждого предложения по отдельности, чтобы получить более точный итоговый результат.

/**
 * Splits an analysis result such as "german87.5" into its parts.
 * Returns { language: null, reliability: 0 } for "undefined".
 */
function ilParseVerdict(result) {
    for (const language of ["german", "english"]) {
        if (result.includes(language)) {
            return { language, reliability: parseFloat(result.replace(language, "")) };
        }
    }
    return { language: null, reliability: 0 };
}

/** Formats a verdict the way identifyLanguage() reports it. */
function ilFormatVerdict(language, reliability) {
    return language === null ? "undefined" : language + " (" + reliability + "% reliable)";
}

/**
 * Identifies the language of `message`.
 *
 * The whole message is analysed first. If that verdict is at least 70 %
 * reliable it is returned directly. Otherwise the message is split into
 * sentences (after ".", "!", "?" and ":"), each sentence is analysed on its
 * own, and the sentence-level verdict is combined with the whole-message one:
 *  - whole message undefined       -> sentence-level verdict;
 *  - both agree                    -> that language, reliabilities averaged;
 *  - they disagree                 -> the more confident of the two wins, with
 *                                     its share of the combined confidence.
 * Progress is reported via console.warn / console.log, the final verdict via
 * console.info.
 *
 * Changes compared with the previous implementation:
 *  - no implicit globals;
 *  - a last sentence without terminal punctuation is no longer dropped;
 *  - in the "disagree" case the normalisation previously mixed up its operands,
 *    so the whole-message verdict could never win; both sides are now compared.
 *
 * @param {string} message - Text to classify.
 * @param {function(string): string} [analyze=languageAnalysis] - Analyser that
 *     returns "german<percent>", "english<percent>" or "undefined".
 * @returns {string} "german (<percent>% reliable)", "english (<percent>% reliable)"
 *     or "undefined".
 */
function identifyLanguage(message, analyze = languageAnalysis) {
    const overall = ilParseVerdict(analyze(message));
    const overallLabel = ilFormatVerdict(overall.language, overall.reliability);

    // `reliability` is 0 for an undefined verdict, so this also implies a known language.
    if (!(overall.reliability < 70)) {
        console.info(overallLabel);
        return overallLabel;
    }

    console.warn(overallLabel);

    // Each piece ends with its terminator; a trailing piece without one is kept as well.
    const pieces = String(message).match(/[^?!.:]*[?!.:]|[^?!.:]+$/g) || [];
    const sentences = pieces.map((piece) => piece.trim()).filter((piece) => piece !== "");

    let language;
    if (sentences.length > 1) {
        const classified = [];
        for (const sentence of sentences) {
            const verdict = ilParseVerdict(analyze(sentence));
            console.log(ilFormatVerdict(verdict.language, verdict.reliability));
            if (verdict.language !== null) {
                classified.push(verdict);
            }
        }

        // FINAL MEASURE (single sentences): same point scheme as in the analyser.
        let germanPoints = 0;
        let englishPoints = 0;
        for (const { language: sentenceLanguage, reliability } of classified) {
            if (sentenceLanguage === "german") {
                germanPoints += reliability;
                englishPoints += 100 - reliability;
            } else {
                englishPoints += reliability;
                germanPoints += 100 - reliability;
            }
        }
        const pointTotal = germanPoints + englishPoints;
        const germanShare = (germanPoints / pointTotal) * 100;
        const englishShare = (englishPoints / pointTotal) * 100;

        let sentenceLanguage = null;
        let sentenceShare = 0;
        if (classified.length > 0 && germanShare > englishShare) {
            sentenceLanguage = "german";
            sentenceShare = germanShare;
        } else if (classified.length > 0 && englishShare > germanShare) {
            sentenceLanguage = "english";
            sentenceShare = englishShare;
        }

        if (sentenceLanguage === null) {
            // No sentence could be classified, or the sentences are evenly split.
            language = "undefined";
        } else if (overall.language === null) {
            language = ilFormatVerdict(sentenceLanguage, sentenceShare);
        } else if (overall.language === sentenceLanguage) {
            language = ilFormatVerdict(sentenceLanguage, (sentenceShare + overall.reliability) / 2);
        } else {
            // Disagreement: weigh the sentence-level confidence against the whole-message one.
            const combined = sentenceShare + overall.reliability;
            const sentencePart = (sentenceShare / combined) * 100;
            const overallPart = (overall.reliability / combined) * 100;
            if (sentencePart > overallPart) {
                language = ilFormatVerdict(sentenceLanguage, sentencePart);
            } else if (overallPart > sentencePart) {
                language = ilFormatVerdict(overall.language, overallPart);
            } else {
                language = "undefined";
            }
        }
    } else {
        language = overallLabel;
    }

    console.info(language);

    return language;
}







// Demo: classify a sample message and show the verdict in a browser alert box.
// `message` is declared explicitly so the script also runs in strict mode.
const message = "Hello you! How are you?";
alert(identifyLanguage(message));
Добро пожаловать на сайт PullRequest, где вы можете задавать вопросы и получать ответы от других членов сообщества.
...