Автор: Stas Malinovski
Методом таблицы модельных распределений:
Code: |
type TCodePage = (cpWin1251, cp866, cpKOI8R); PMap = ^TMap; TMap = array[#$80..#$FF] of Char;
function GetMap(CP: TCodePage): PMap; { должна возвращать указатель на таблицу перекодировки из CP в Windows1251 (nil для CP = cpWin1251) } begin GetMap := nil; end;
function DetermineRussian(Buf: PChar; Count: Integer): TCodePage; const ModelBigrams: array[0..33, 0..33] of Byte = ( {АБВГДЕЖЗИЙКЛМHОПРСТУФХЦЧШЩЪЫЬЭЮЯ_?} {А}(0, 20, 44, 12, 22, 23, 16, 60, 4, 9, 63, 93, 47, 110, 0, 16, 35, 61, 81, 1, 5, 13, 24, 17, 12, 4, 0, 0, 0, 0, 14, 31, 205, 1), {Б}(19, 0, 0, 0, 4, 19, 0, 0, 8, 0, 2, 15, 1, 4, 41, 0, 15, 5, 0, 15, 0, 2, 1, 0, 0, 6, 16, 37, 0, 0, 0, 4, 3, 0), {В}(97, 0, 1, 0, 2, 57, 0, 5, 40, 0, 4, 25, 2, 23, 78, 2, 8, 28, 4, 12, 0, 1, 0, 0, 8, 1, 0, 40, 1, 0, 0, 5, 106, 3), {Г}(13, 0, 0, 0, 9, 5, 0, 0, 15, 0, 1, 17, 1, 2, 96, 0, 24, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0), {Д}(63, 0, 9, 1, 2, 71, 1, 0, 35, 0, 3, 16, 2, 22, 50, 2, 19, 9, 2, 25, 0, 2, 1, 0, 1, 0, 1, 9, 4, 0, 1, 5, 17, 4), {Е}(4, 14, 15, 34, 56, 22, 13, 14, 2, 34, 39, 77, 73, 150, 6, 9, 101, 64, 81, 1, 0, 15, 5, 12, 10, 6, 0, 0, 0, 0, 3, 4, 235, 1), {Ж}(13, 0, 0, 0, 12, 47, 0, 0, 16, 0, 1, 0, 0, 23, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2), {З}(76, 2, 11, 3, 11, 4, 1, 0, 7, 0, 2, 4, 11, 24, 17, 0, 6, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 16, 6, 0, 1, 4, 17, 0), {И}(7, 9, 32, 5, 18, 60, 4, 42, 31, 27, 28, 46, 55, 49, 12, 7, 26, 60, 53, 0, 5, 25, 14, 28, 4, 1, 0, 0, 0, 0, 9, 56, 255, 0), {Й}(0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 3, 0, 3, 0, 0, 0, 10, 3, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 122, 0), {К}(92, 0, 3, 0, 0, 7, 2, 1, 39, 0, 0, 27, 0, 14, 110, 0, 18, 5, 35, 18, 0, 0, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 0), {Л}(85, 1, 0, 2, 1, 70, 6, 0, 85, 0, 5, 3, 0, 9, 67, 1, 0, 9, 0, 15, 0, 0, 0, 2, 0, 0, 0, 9, 66, 0, 15, 43, 57, 4), {М}(44, 0, 0, 0, 0, 65, 0, 0, 47, 0, 1, 1, 10, 15, 57, 7, 0, 2, 0, 24, 0, 0, 0, 0, 0, 0, 0, 28, 0, 0, 0, 8, 109, 3), {}(139, 0, 0, 1, 11, 108, 0, 4, 152, 0, 7, 0, 1, 69, 161, 0, 0, 8, 25, 24, 5, 1, 5, 2, 0, 1, 0, 83, 10, 0, 1, 29, 38, 5), {О}(0, 72, 139, 76, 74, 32, 32, 19, 12, 52, 21, 93, 68, 72, 7, 34, 93, 102, 98, 1, 2, 6, 6, 19, 15, 2, 0, 0, 0, 1, 4, 9, 252, 2), {П}(17, 0, 0, 0, 0, 43, 0, 0, 14, 0, 1, 9, 0, 1, 125, 3, 120, 1, 2, 8, 0, 0, 0, 0, 0, 0, 0, 3, 6, 0, 0, 3, 2, 2), {Р}(151, 1, 6, 4, 3, 103, 7, 0, 76, 0, 4, 0, 11, 10, 117, 1, 0, 5, 9, 39, 2, 5, 0, 1, 3, 0, 0, 24, 7, 0, 1, 10, 22, 5), {С}(24, 1, 21, 0, 3, 39, 0, 0, 33, 0, 56, 41, 11, 15, 58, 30, 5, 30, 183, 16, 0, 4, 1, 4, 1, 0, 0, 8, 25, 0, 1, 50, 41, 2), {Т}(83, 0, 43, 0, 3, 87, 0, 0, 71, 0, 9, 3, 2, 26, 180, 0, 55, 33, 1, 23, 1, 0, 1, 4, 0, 0, 0, 20, 78, 0, 0, 5, 82, 4), {У}(3, 6, 7, 14, 19, 8, 13, 6, 0, 1, 13, 15, 10, 7, 0, 12, 17, 16, 19, 0, 1, 3, 0, 12, 5, 8, 0, 0, 0, 0, 22, 1, 65, 0), {Ф}(4, 0, 0, 0, 0, 4, 0, 0, 11, 0, 0, 1, 0, 0, 9, 0, 3, 0, 0, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0), {Х}(9, 0, 2, 0, 0, 2, 0, 0, 5, 0, 0, 1, 0, 5, 26, 0, 4, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 76, 0), {Ц}(5, 0, 0, 0, 0, 16, 0, 0, 48, 0, 1, 0, 0, 0, 4, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 3, 0), {Ч}(30, 0, 0, 0, 0, 52, 0, 0, 23, 0, 3, 1, 0, 14, 1, 0, 0, 0, 36, 5, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 2, 2), {Ш}(13, 0, 0, 0, 0, 28, 0, 0, 17, 0, 4, 4, 0, 4, 3, 0, 0, 0, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 1, 1), {Щ}(6, 0, 0, 0, 0, 23, 0, 0, 16, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1), {Ъ}(0, 0, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0), {Ы}(0, 5, 14, 1, 3, 28, 0, 2, 0, 22, 6, 19, 21, 2, 0, 5, 4, 7, 10, 0, 0, 37, 0, 3, 4, 0, 0, 0, 0, 0, 0, 1, 84, 0), {Ь}(0, 1, 0, 0, 0, 9, 0, 10, 1, 0, 13, 0, 2, 26, 0, 0, 0, 10, 3, 0, 0, 0, 1, 0, 6, 0, 0, 0, 0, 0, 6, 4, 117, 0), {Э}(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 31, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), {Ю}(0, 5, 0, 0, 3, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 15, 0, 0, 0, 1, 4, 1, 15, 0, 0, 0, 0, 0, 0, 38, 0), {Я}(0, 0, 9, 2, 7, 10, 3, 19, 0, 0, 1, 6, 7, 8, 0, 0, 2, 6, 19, 0, 0, 3, 5, 1, 0, 3, 0, 0, 0, 0, 5, 2, 177, 0), {_}(42, 80, 193, 43, 109, 41, 18, 53, 159, 0, 144, 27, 83, 176, 187, 229, 70, 231, 99, 47, 15, 13, 6, 58, 7, 0, 0, 0, 0, 38, 0, 22, 0, 2), {?}(0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 2, 4, 4, 8, 0, 0, 5, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); { " рейтинг" буквы ? условно принимается равным 1/20 от " рейтинга" буквы E, если сочетание с участием ? корректно, иначе - 0 } type TVariation = array[0..33, 0..33] of Integer; var I, J, iC, iPredC, Max: Integer; C: Char; CP: TCodePage; D, MinD, Factor: Double; AMap: PMap; PV: ^TVariation; Vars: array[TCodePage] of TVariation; begin DetermineRussian := cpWin1251; { по yмолчанию } { вычисление распределений биграмм } FillChar(Vars, SizeOf(Vars), 0); for CP := Low(Vars) to High(Vars) do begin AMap := GetMap(CP); PV := @Vars[CP]; iPredC := 32; for I := 0 to Count - 1 do begin C := Buf[I]; iC := 32; if C > = #128 then begin if AMap < > nil then C := AMap^[C]; if not (C in ['?', '?']) then begin C := Chr(Ord(C) and not 32); { 'a'..'я' -> 'А'..'Я' } if C in ['А'..'Я'] then iC := Ord(C) - Ord('А'); end else iC := 33; end; Inc(PV^[iPredC, iC]); iPredC := iC; end; end; { вычисление метрики и определение наиболее правдоподобной кодировки } MinD := 0; for CP := Low(Vars) to High(Vars) do begin PV := @Vars[CP]; PV^[32, 32] := 0; Max := 1; for I := 0 to 33 do for J := 0 to 33 do if PV^[I, J] > Max then Max := PV^[I, J]; Factor := 255 / Max; { ноpмализация } D := 0; for I := 0 to 33 do for J := 0 to 33 do D := D + Abs(PV^[I, J] * Factor - ModelBigrams[I, J]); if (MinD = 0) or (D < MinD) then begin MinD := D; DetermineRussian := CP; end; end; end;
begin { тест: слово 'Пример' в разных кодировках (веpоятность ошибок на таких коpотких текстах высока - в данном слyчае пpосто повезло!) } writeln(DetermineRussian(#$CF#$F0#$E8#$EC#$E5#$F0, 6) = cpWin1251); writeln(DetermineRussian(#$8F#$E0#$A8#$AC#$A5#$E0, 6) = cp866); writeln(DetermineRussian(#$F0#$D2#$C9#$CD#$C5#$D2, 6) = cpKOI8R); readln; end. |
Просьба писать ваши замечания, наблюдения и все остальное,
что поможет улучшить предоставляемую информацию на этом сайте.
ВСЕ КОММЕНТАРИИ МОДЕРИРУЮТСЯ ВРУЧНУЮ, ТАК ЧТО СПАМИТЬ БЕСПОЛЕЗНО!