diff --git a/post.php b/post.php
index 1aa1edf9..88ff9665 100644
--- a/post.php
+++ b/post.php
@@ -143,6 +143,28 @@ function download_file_from_url($file_url, $request_timeout, $allowed_extensions
 	);
 }
 
+/**
+ * Try to extract text from the given image by piping it through tesseract.
+ *
+ * @param array $config Instance configuration.
+ * @param string $img_path The file path to the image.
+ * @return string The trimmed text recognized in the image (possibly an empty string).
+ * @throws RuntimeException Throws if executing tesseract fails.
+ */
+function ocr_image(array $config, string $img_path): string {
+	// The default preprocess command is an ImageMagick b/w quantization.
+	$ret = shell_exec_error(
+		sprintf($config['tesseract_preprocess_command'], escapeshellarg($img_path))
+		. ' | tesseract stdin stdout 2>/dev/null '
+		. $config['tesseract_params']
+	);
+	if ($ret === false) {
+		throw new RuntimeException('Unable to run tesseract');
+	}
+
+	return trim($ret);
+}
+
 /**
  * Method handling functions
  */
@@ -1068,7 +1090,6 @@ if (isset($_POST['delete'])) {
 				$image->destroy();
 			} else {
 				// not an image
-				//copy($config['file_thumb'], $post['thumb']);
 				$file['thumb'] = 'file';
 
 				$size = @getimagesize(sprintf($config['file_thumb'],
@@ -1086,23 +1107,18 @@ if (isset($_POST['delete'])) {
 					$fname = $file['thumb'];
 				}
 
-				if ($fname == 'spoiler') { // We don't have that much CPU time, do we?
-				}
-				else {
-					$tmpname = "tmp/tesseract/".rand(0,10000000);
-
-					// Preprocess command is an ImageMagick b/w quantization
-					$error = shell_exec_error(sprintf($config['tesseract_preprocess_command'], escapeshellarg($fname)) . " | " .
-						'tesseract stdin '.escapeshellarg($tmpname).' '.$config['tesseract_params']);
-					$tmpname .= ".txt";
-
-					$value = @file_get_contents($tmpname);
-					@unlink($tmpname);
-
-					if ($value && trim($value)) {
-						// This one has an effect, that the body is appended to a post body. So you can write a correct
-						// spamfilter.
-						$post['body_nomarkup'] .= "".htmlspecialchars($value)."";
+				if ($fname !== 'spoiler') { // We don't have that much CPU time, do we?
+					try {
+						$txt = ocr_image($config, $fname);
+						if ($txt !== '') {
+							// This one has an effect, that the body is appended to a post body. So you can write a correct
+							// spamfilter.
+							$post['body_nomarkup'] .= "" . htmlspecialchars($txt) . "";
+						}
+					} catch (RuntimeException $e) {
+						if ($config['syslog']) {
+							_syslog(LOG_ERR, "Could not OCR image: {$e->getMessage()}");
+						}
 					}
 				}
 			}