Magic operation now detects UTF8 and gives a probability score for each language
This commit is contained in:
parent
865ee6a720
commit
6624f25a64
13
package-lock.json
generated
13
package-lock.json
generated
@ -1404,6 +1404,14 @@
|
|||||||
"supports-color": "2.0.0"
|
"supports-color": "2.0.0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"chi-squared": {
|
||||||
|
"version": "1.1.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/chi-squared/-/chi-squared-1.1.0.tgz",
|
||||||
|
"integrity": "sha1-iShlz/qOCnIPkhv8nGNcGawqNG0=",
|
||||||
|
"requires": {
|
||||||
|
"gamma": "1.0.0"
|
||||||
|
}
|
||||||
|
},
|
||||||
"chokidar": {
|
"chokidar": {
|
||||||
"version": "1.7.0",
|
"version": "1.7.0",
|
||||||
"resolved": "https://registry.npmjs.org/chokidar/-/chokidar-1.7.0.tgz",
|
"resolved": "https://registry.npmjs.org/chokidar/-/chokidar-1.7.0.tgz",
|
||||||
@ -4255,6 +4263,11 @@
|
|||||||
"integrity": "sha1-GwqzvVU7Kg1jmdKcDj6gslIHgyc=",
|
"integrity": "sha1-GwqzvVU7Kg1jmdKcDj6gslIHgyc=",
|
||||||
"dev": true
|
"dev": true
|
||||||
},
|
},
|
||||||
|
"gamma": {
|
||||||
|
"version": "1.0.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/gamma/-/gamma-1.0.0.tgz",
|
||||||
|
"integrity": "sha1-mDwck5/iPZMnAVhXEeHZpDDLdMs="
|
||||||
|
},
|
||||||
"get-caller-file": {
|
"get-caller-file": {
|
||||||
"version": "1.0.2",
|
"version": "1.0.2",
|
||||||
"resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-1.0.2.tgz",
|
"resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-1.0.2.tgz",
|
||||||
|
@ -72,6 +72,7 @@
|
|||||||
"bootstrap": "^3.3.7",
|
"bootstrap": "^3.3.7",
|
||||||
"bootstrap-colorpicker": "^2.5.2",
|
"bootstrap-colorpicker": "^2.5.2",
|
||||||
"bootstrap-switch": "^3.3.4",
|
"bootstrap-switch": "^3.3.4",
|
||||||
|
"chi-squared": "^1.1.0",
|
||||||
"crypto-api": "^0.7.5",
|
"crypto-api": "^0.7.5",
|
||||||
"crypto-js": "^3.1.9-1",
|
"crypto-js": "^3.1.9-1",
|
||||||
"diff": "^3.4.0",
|
"diff": "^3.4.0",
|
||||||
|
@ -278,8 +278,7 @@ const FlowControl = {
|
|||||||
<tr>
|
<tr>
|
||||||
<th>Recipe (click to load)</th>
|
<th>Recipe (click to load)</th>
|
||||||
<th>Data snippet</th>
|
<th>Data snippet</th>
|
||||||
<th>Most likely language\n(lower scores are better)</th>
|
<th>Properties</th>
|
||||||
<th>File type</th>
|
|
||||||
</tr>`;
|
</tr>`;
|
||||||
|
|
||||||
options.forEach(option => {
|
options.forEach(option => {
|
||||||
@ -290,20 +289,25 @@ const FlowControl = {
|
|||||||
.concat(currentRecipeConfig.slice(state.progress + 1)),
|
.concat(currentRecipeConfig.slice(state.progress + 1)),
|
||||||
recipeURL = "recipe=" + Utils.encodeURIFragment(Utils.generatePrettyRecipe(recipeConfig));
|
recipeURL = "recipe=" + Utils.encodeURIFragment(Utils.generatePrettyRecipe(recipeConfig));
|
||||||
|
|
||||||
const language = option.languageScores[0];
|
const bestLanguage = option.languageScores[0];
|
||||||
let fileType = "Unknown";
|
let language = "Unknown",
|
||||||
|
fileType = "Unknown";
|
||||||
|
|
||||||
|
if (bestLanguage.probability > 0.00005) {
|
||||||
|
language = Magic.codeToLanguage(bestLanguage.lang) + " " +
|
||||||
|
(bestLanguage.probability * 100).toFixed(2) + "%";
|
||||||
|
}
|
||||||
|
|
||||||
if (option.fileType) {
|
if (option.fileType) {
|
||||||
fileType = `Extension: ${option.fileType.ext}\nMime type: ${option.fileType.mime}`;
|
fileType = `${option.fileType.mime} (${option.fileType.ext})`;
|
||||||
if (option.fileType.desc)
|
|
||||||
fileType += `\nDescription: ${option.fileType.desc}`;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
output += `<tr>
|
output += `<tr>
|
||||||
<td><a href="#${recipeURL}">${Utils.generatePrettyRecipe(option.recipe, true)}</a></td>
|
<td><a href="#${recipeURL}">${Utils.generatePrettyRecipe(option.recipe, true)}</a></td>
|
||||||
<td>${Utils.escapeHtml(Utils.printable(Utils.truncate(option.data, 99)))}</td>
|
<td>${Utils.escapeHtml(Utils.printable(Utils.truncate(option.data, 99)))}</td>
|
||||||
<td>${Magic.codeToLanguage(language.lang)}\nScore: ${language.chiSqr.toFixed()}</td>
|
<td>Language: ${language}
|
||||||
<td>${fileType}</td>
|
File type: ${fileType}
|
||||||
|
Valid UTF8: ${option.isUTF8}</td>
|
||||||
</tr>`;
|
</tr>`;
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -3,6 +3,7 @@ import Utils from "../Utils.js";
|
|||||||
import Recipe from "../Recipe.js";
|
import Recipe from "../Recipe.js";
|
||||||
import Dish from "../Dish.js";
|
import Dish from "../Dish.js";
|
||||||
import FileType from "../operations/FileType.js";
|
import FileType from "../operations/FileType.js";
|
||||||
|
import chiSquared from "chi-squared";
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -19,11 +20,12 @@ class Magic {
|
|||||||
* Magic constructor.
|
* Magic constructor.
|
||||||
*
|
*
|
||||||
* @param {ArrayBuffer} buf
|
* @param {ArrayBuffer} buf
|
||||||
|
* @param {Object[]} [opPatterns]
|
||||||
*/
|
*/
|
||||||
constructor(buf) {
|
constructor(buf, opPatterns) {
|
||||||
this.inputBuffer = new Uint8Array(buf);
|
this.inputBuffer = new Uint8Array(buf);
|
||||||
this.inputStr = Utils.arrayBufferToStr(buf);
|
this.inputStr = Utils.arrayBufferToStr(buf);
|
||||||
this.opPatterns = Magic._generateOpPatterns();
|
this.opPatterns = opPatterns || Magic._generateOpPatterns();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -58,15 +60,17 @@ class Magic {
|
|||||||
let chiSqrs = [];
|
let chiSqrs = [];
|
||||||
|
|
||||||
for (let lang in LANG_FREQS) {
|
for (let lang in LANG_FREQS) {
|
||||||
|
let [score, prob] = Magic._chiSqr(inputFreq, LANG_FREQS[lang]);
|
||||||
chiSqrs.push({
|
chiSqrs.push({
|
||||||
lang: lang,
|
lang: lang,
|
||||||
chiSqr: Magic._chiSqr(inputFreq, LANG_FREQS[lang])
|
score: score,
|
||||||
|
probability: prob
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
// Sort results so that the most likely match is at the top
|
// Sort results so that the most likely match is at the top
|
||||||
chiSqrs.sort((a, b) => {
|
chiSqrs.sort((a, b) => {
|
||||||
return a.chiSqr - b.chiSqr;
|
return a.score - b.score;
|
||||||
});
|
});
|
||||||
|
|
||||||
return chiSqrs;
|
return chiSqrs;
|
||||||
@ -84,6 +88,81 @@ class Magic {
|
|||||||
return FileType.magicType(this.inputBuffer);
|
return FileType.magicType(this.inputBuffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Detects whether the input buffer is valid UTF8.
|
||||||
|
*
|
||||||
|
* @returns {boolean}
|
||||||
|
*/
|
||||||
|
isUTF8() {
|
||||||
|
const bytes = new Uint8Array(this.inputBuffer);
|
||||||
|
let i = 0;
|
||||||
|
while (i < bytes.length) {
|
||||||
|
if (( // ASCII
|
||||||
|
bytes[i] === 0x09 ||
|
||||||
|
bytes[i] === 0x0A ||
|
||||||
|
bytes[i] === 0x0D ||
|
||||||
|
(0x20 <= bytes[i] && bytes[i] <= 0x7E)
|
||||||
|
)) {
|
||||||
|
i += 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (( // non-overlong 2-byte
|
||||||
|
(0xC2 <= bytes[i] && bytes[i] <= 0xDF) &&
|
||||||
|
(0x80 <= bytes[i+1] && bytes[i+1] <= 0xBF)
|
||||||
|
)) {
|
||||||
|
i += 2;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (( // excluding overlongs
|
||||||
|
bytes[i] === 0xE0 &&
|
||||||
|
(0xA0 <= bytes[i + 1] && bytes[i + 1] <= 0xBF) &&
|
||||||
|
(0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF)
|
||||||
|
) ||
|
||||||
|
( // straight 3-byte
|
||||||
|
((0xE1 <= bytes[i] && bytes[i] <= 0xEC) ||
|
||||||
|
bytes[i] === 0xEE ||
|
||||||
|
bytes[i] === 0xEF) &&
|
||||||
|
(0x80 <= bytes[i + 1] && bytes[i+1] <= 0xBF) &&
|
||||||
|
(0x80 <= bytes[i+2] && bytes[i+2] <= 0xBF)
|
||||||
|
) ||
|
||||||
|
( // excluding surrogates
|
||||||
|
bytes[i] === 0xED &&
|
||||||
|
(0x80 <= bytes[i+1] && bytes[i+1] <= 0x9F) &&
|
||||||
|
(0x80 <= bytes[i+2] && bytes[i+2] <= 0xBF)
|
||||||
|
)) {
|
||||||
|
i += 3;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (( // planes 1-3
|
||||||
|
bytes[i] === 0xF0 &&
|
||||||
|
(0x90 <= bytes[i + 1] && bytes[i + 1] <= 0xBF) &&
|
||||||
|
(0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF) &&
|
||||||
|
(0x80 <= bytes[i + 3] && bytes[i + 3] <= 0xBF)
|
||||||
|
) ||
|
||||||
|
( // planes 4-15
|
||||||
|
(0xF1 <= bytes[i] && bytes[i] <= 0xF3) &&
|
||||||
|
(0x80 <= bytes[i + 1] && bytes[i + 1] <= 0xBF) &&
|
||||||
|
(0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF) &&
|
||||||
|
(0x80 <= bytes[i + 3] && bytes[i + 3] <= 0xBF)
|
||||||
|
) ||
|
||||||
|
( // plane 16
|
||||||
|
bytes[i] === 0xF4 &&
|
||||||
|
(0x80 <= bytes[i + 1] && bytes[i + 1] <= 0x8F) &&
|
||||||
|
(0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF) &&
|
||||||
|
(0x80 <= bytes[i + 3] && bytes[i + 3] <= 0xBF)
|
||||||
|
)) {
|
||||||
|
i += 4;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Speculatively executes matching operations, recording metadata of each result.
|
* Speculatively executes matching operations, recording metadata of each result.
|
||||||
@ -103,6 +182,7 @@ class Magic {
|
|||||||
data: this.inputStr.slice(0, 100),
|
data: this.inputStr.slice(0, 100),
|
||||||
languageScores: this.detectLanguage(),
|
languageScores: this.detectLanguage(),
|
||||||
fileType: this.detectFileType(),
|
fileType: this.detectFileType(),
|
||||||
|
isUTF8: this.isUTF8()
|
||||||
});
|
});
|
||||||
|
|
||||||
// Find any operations that can be run on this data
|
// Find any operations that can be run on this data
|
||||||
@ -122,7 +202,7 @@ class Magic {
|
|||||||
const recipe = new Recipe([opConfig]);
|
const recipe = new Recipe([opConfig]);
|
||||||
await recipe.execute(dish, 0);
|
await recipe.execute(dish, 0);
|
||||||
|
|
||||||
const magic = new Magic(dish.get(Dish.ARRAY_BUFFER)),
|
const magic = new Magic(dish.get(Dish.ARRAY_BUFFER), this.opPatterns),
|
||||||
speculativeResults = await magic.speculativeExecution(depth-1, [...recipeConfig, opConfig]);
|
speculativeResults = await magic.speculativeExecution(depth-1, [...recipeConfig, opConfig]);
|
||||||
|
|
||||||
results = results.concat(speculativeResults);
|
results = results.concat(speculativeResults);
|
||||||
@ -131,13 +211,17 @@ class Magic {
|
|||||||
// Return a sorted list of possible recipes along with their properties
|
// Return a sorted list of possible recipes along with their properties
|
||||||
return results.sort((a, b) => {
|
return results.sort((a, b) => {
|
||||||
// Each option is sorted based on its most likely language (lower is better)
|
// Each option is sorted based on its most likely language (lower is better)
|
||||||
let aScore = a.languageScores[0].chiSqr,
|
let aScore = a.languageScores[0].score,
|
||||||
bScore = b.languageScores[0].chiSqr;
|
bScore = b.languageScores[0].score;
|
||||||
|
|
||||||
// If a recipe results in a file being detected, it receives a relatively good score
|
// If a recipe results in a file being detected, it receives a relatively good score
|
||||||
if (a.fileType) aScore = 500;
|
if (a.fileType) aScore = 500;
|
||||||
if (b.fileType) bScore = 500;
|
if (b.fileType) bScore = 500;
|
||||||
|
|
||||||
|
// If the result is valid UTF8, its score gets boosted (lower being better)
|
||||||
|
if (a.isUTF8) aScore -= 100;
|
||||||
|
if (b.isUTF8) bScore -= 100;
|
||||||
|
|
||||||
return aScore - bScore;
|
return aScore - bScore;
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@ -196,17 +280,22 @@ class Magic {
|
|||||||
* @private
|
* @private
|
||||||
* @param {number[]} observed
|
* @param {number[]} observed
|
||||||
* @param {number[]} expected
|
* @param {number[]} expected
|
||||||
* @returns {number}
|
* @param {number} ddof - Delta degrees of freedom
|
||||||
|
* @returns {number[]} - The score and the probability
|
||||||
*/
|
*/
|
||||||
static _chiSqr(observed, expected) {
|
static _chiSqr(observed, expected, ddof=0) {
|
||||||
let tmp,
|
let tmp,
|
||||||
res = 0;
|
score = 0;
|
||||||
|
|
||||||
for (let i = 0; i < observed.length; i++) {
|
for (let i = 0; i < observed.length; i++) {
|
||||||
tmp = observed[i] - expected[i];
|
tmp = observed[i] - expected[i];
|
||||||
res += tmp * tmp / expected[i];
|
score += tmp * tmp / expected[i];
|
||||||
}
|
}
|
||||||
return res;
|
|
||||||
|
return [
|
||||||
|
score,
|
||||||
|
1 - chiSquared.cdf(score, observed.length - 1 - ddof)
|
||||||
|
];
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
Loading…
Reference in New Issue
Block a user