mirror of
https://github.com/vichan-devel/vichan.git
synced 2024-11-23 23:20:57 +01:00
tesseract OCR support for spamfilters
This commit is contained in:
parent
36d762514c
commit
8a46c7a0d5
@ -824,6 +824,15 @@
|
|||||||
// Set this to true if you're using Linux and you can execute `md5sum` binary.
|
// Set this to true if you're using Linux and you can execute `md5sum` binary.
|
||||||
$config['gnu_md5'] = false;
|
$config['gnu_md5'] = false;
|
||||||
|
|
||||||
|
// Use Tesseract OCR to extract text from uploaded images so that spam filters can match against it.
|
||||||
|
$config['tesseract_ocr'] = false;
|
||||||
|
|
||||||
|
// Extra command-line parameters appended to the tesseract invocation.
|
||||||
|
$config['tesseract_params'] = '';
|
||||||
|
|
||||||
|
// Tesseract preprocess command. %s is replaced with the shell-escaped image path; the command's output is piped to tesseract.
|
||||||
|
$config['tesseract_preprocess_command'] = 'convert -monochrome %s -';
|
||||||
|
|
||||||
// Number of posts in a "View Last X Posts" page
|
// Number of posts in a "View Last X Posts" page
|
||||||
$config['noko50_count'] = 50;
|
$config['noko50_count'] = 50;
|
||||||
// Number of posts a thread needs before it gets a "View Last X Posts" page.
|
// Number of posts a thread needs before it gets a "View Last X Posts" page.
|
||||||
@ -1015,6 +1024,10 @@
|
|||||||
// Minify Javascript using http://code.google.com/p/minify/.
|
// Minify Javascript using http://code.google.com/p/minify/.
|
||||||
$config['minify_js'] = false;
|
$config['minify_js'] = false;
|
||||||
|
|
||||||
|
// Dispatch thumbnail loading and image configuration with JavaScript. This requires accompanying
|
||||||
|
// JavaScript code to work.
|
||||||
|
$config['javascript_image_dispatch'] = false;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* ====================
|
* ====================
|
||||||
* Video embedding
|
* Video embedding
|
||||||
|
@ -2695,7 +2695,7 @@ function slugify($post) {
|
|||||||
elseif (isset ($post['body_nomarkup']) && $post['body_nomarkup'])
|
elseif (isset ($post['body_nomarkup']) && $post['body_nomarkup'])
|
||||||
$slug = $post['body_nomarkup'];
|
$slug = $post['body_nomarkup'];
|
||||||
elseif (isset ($post['body']) && $post['body'])
|
elseif (isset ($post['body']) && $post['body'])
|
||||||
$slug = strip_html($post['body']);
|
$slug = strip_tags($post['body']);
|
||||||
|
|
||||||
// Fix UTF-8 first
|
// Fix UTF-8 first
|
||||||
$slug = mb_convert_encoding($slug, "UTF-8", "UTF-8");
|
$slug = mb_convert_encoding($slug, "UTF-8", "UTF-8");
|
||||||
|
33
post.php
33
post.php
@ -788,6 +788,34 @@ if (isset($_POST['delete'])) {
|
|||||||
$file['thumbheight'] = $size[1];
|
$file['thumbheight'] = $size[1];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if ($config['tesseract_ocr']) { // Let's OCR it!
|
||||||
|
$fname = $file['tmp_name'];
|
||||||
|
|
||||||
|
if ($file['height'] > 500 || $file['width'] > 500) {
|
||||||
|
$fname = $file['thumb'];
|
||||||
|
}
|
||||||
|
|
||||||
|
if ($fname == 'spoiler') { // We don't have that much CPU time, do we?
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
$tmpname = "tmp/tesseract/".rand(0,10000000);
|
||||||
|
|
||||||
|
// Preprocess command is an ImageMagick b/w quantization
|
||||||
|
$error = shell_exec_error(sprintf($config['tesseract_preprocess_command'], escapeshellarg($fname)) . " | " .
|
||||||
|
'tesseract stdin '.escapeshellarg($tmpname).' '.$config['tesseract_params']);
|
||||||
|
$tmpname .= ".txt";
|
||||||
|
|
||||||
|
$value = @file_get_contents($tmpname);
|
||||||
|
@unlink($tmpname);
|
||||||
|
|
||||||
|
if ($value && trim($value)) {
|
||||||
|
// Append the recognized text to the post's body_nomarkup (wrapped in a tinyboard tag) so that
|
||||||
|
// spam filters can match against the text found in the image.
|
||||||
|
$post['body_nomarkup'] .= "<tinyboard ocr image $key>".htmlspecialchars($value)."</tinyboard>";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (!isset($dont_copy_file) || !$dont_copy_file) {
|
if (!isset($dont_copy_file) || !$dont_copy_file) {
|
||||||
if (isset($file['file_tmp'])) {
|
if (isset($file['file_tmp'])) {
|
||||||
if (!@rename($file['tmp_name'], $file['file']))
|
if (!@rename($file['tmp_name'], $file['file']))
|
||||||
@ -827,6 +855,11 @@ if (isset($_POST['delete'])) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Run the filters again when OCR is enabled, so they also see the text extracted from images.
|
||||||
|
if ($config['tesseract_ocr'] && !hasPermission($config['mod']['bypass_filters'], $board['uri'])) {
|
||||||
|
do_filters($post);
|
||||||
|
}
|
||||||
|
|
||||||
if (!hasPermission($config['mod']['postunoriginal'], $board['uri']) && $config['robot_enable'] && checkRobot($post['body_nomarkup'])) {
|
if (!hasPermission($config['mod']['postunoriginal'], $board['uri']) && $config['robot_enable'] && checkRobot($post['body_nomarkup'])) {
|
||||||
undoImage($post);
|
undoImage($post);
|
||||||
if ($config['robot_mute']) {
|
if ($config['robot_mute']) {
|
||||||
|
0
tmp/tesseract/.gitkeep
Normal file
0
tmp/tesseract/.gitkeep
Normal file
Loading…
Reference in New Issue
Block a user