1
0
mirror of https://github.com/vichan-devel/vichan.git synced 2025-02-17 11:28:41 +01:00

Merge pull request #686 from Zankaria/refactor-post-ocr

post.php: refactor image OCR into function
This commit is contained in:
Lorenzo Yario 2024-03-11 16:13:26 -07:00 committed by GitHub
commit 9aba8f35b6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -143,6 +143,28 @@ function download_file_from_url($file_url, $request_timeout, $allowed_extensions
); );
} }
/**
 * Try to extract text from the given image by piping it through tesseract.
 *
 * The image is first run through the configured preprocess command, whose
 * output is piped into `tesseract stdin stdout`.
 *
 * @param array $config Instance configuration. Reads 'tesseract_preprocess_command'
 *                      (a sprintf template receiving the shell-escaped image path)
 *                      and 'tesseract_params' (extra tesseract command line arguments).
 * @param string $img_path The file path to the image.
 * @return string The trimmed output of the command; empty if nothing was produced.
 * @throws RuntimeException Throws if executing tesseract fails.
 */
function ocr_image(array $config, string $img_path): string {
	// The default preprocess command is an ImageMagick b/w quantization.
	$command = sprintf($config['tesseract_preprocess_command'], escapeshellarg($img_path))
		. ' | tesseract stdin stdout 2>/dev/null';

	// The extra parameters must be separated from the redirection by whitespace,
	// otherwise they get glued onto the redirection target ("2>/dev/null-l eng").
	$params = trim((string) ($config['tesseract_params'] ?? ''));
	if ($params !== '') {
		$command .= ' ' . $params;
	}

	$ret = shell_exec_error($command);
	if ($ret === false) {
		throw new RuntimeException('Unable to run tesseract');
	}

	return trim($ret);
}
/** /**
* Method handling functions * Method handling functions
*/ */
@ -1068,7 +1090,6 @@ if (isset($_POST['delete'])) {
$image->destroy(); $image->destroy();
} else { } else {
// not an image // not an image
//copy($config['file_thumb'], $post['thumb']);
$file['thumb'] = 'file'; $file['thumb'] = 'file';
$size = @getimagesize(sprintf($config['file_thumb'], $size = @getimagesize(sprintf($config['file_thumb'],
@ -1086,23 +1107,18 @@ if (isset($_POST['delete'])) {
$fname = $file['thumb']; $fname = $file['thumb'];
} }
if ($fname == 'spoiler') { // We don't have that much CPU time, do we? if ($fname !== 'spoiler') { // We don't have that much CPU time, do we?
} try {
else { $txt = ocr_image($config, $fname);
$tmpname = "tmp/tesseract/".rand(0,10000000); if ($txt !== '') {
// This one has an effect, that the body is appended to a post body. So you can write a correct
// Preprocess command is an ImageMagick b/w quantization // spamfilter.
$error = shell_exec_error(sprintf($config['tesseract_preprocess_command'], escapeshellarg($fname)) . " | " . $post['body_nomarkup'] .= "<tinyboard ocr image $key>" . htmlspecialchars($value) . "</tinyboard>";
'tesseract stdin '.escapeshellarg($tmpname).' '.$config['tesseract_params']); }
$tmpname .= ".txt"; } catch (RuntimeException $e) {
if ($config['syslog']) {
$value = @file_get_contents($tmpname); _syslog(LOG_ERR, "Could not OCR image: {$e->getMessage()}");
@unlink($tmpname); }
if ($value && trim($value)) {
// This one has an effect, that the body is appended to a post body. So you can write a correct
// spamfilter.
$post['body_nomarkup'] .= "<tinyboard ocr image $key>".htmlspecialchars($value)."</tinyboard>";
} }
} }
} }