diff --git a/prototypes/ocr_demo/logs/Pitchbook 1.log b/prototypes/ocr_demo/logs/Pitchbook 1.log new file mode 100644 index 0000000..742ea38 --- /dev/null +++ b/prototypes/ocr_demo/logs/Pitchbook 1.log @@ -0,0 +1,60 @@ + +Start processing 10 pages concurrently + 2 page already has text! - rasterizing text and running OCR anyway + 3 page already has text! - rasterizing text and running OCR anyway + 4 page already has text! - rasterizing text and running OCR anyway + 5 page already has text! - rasterizing text and running OCR anyway + 6 page already has text! - rasterizing text and running OCR anyway + 7 page already has text! - rasterizing text and running OCR anyway + 8 page already has text! - rasterizing text and running OCR anyway + 9 page already has text! - rasterizing text and running OCR anyway + 10 page already has text! - rasterizing text and running OCR anyway + 11 page already has text! - rasterizing text and running OCR anyway + 12 page already has text! - rasterizing text and running OCR anyway + 13 page already has text! - rasterizing text and running OCR anyway + 14 page already has text! - rasterizing text and running OCR anyway + 15 page already has text! - rasterizing text and running OCR anyway + 16 page already has text! - rasterizing text and running OCR anyway + 17 page already has text! - rasterizing text and running OCR anyway + 18 page already has text! - rasterizing text and running OCR anyway + 11 [tesseract] lots of diacritics - possibly poor OCR + 19 page already has text! - rasterizing text and running OCR anyway + 20 page already has text! - rasterizing text and running OCR anyway + 21 page already has text! - rasterizing text and running OCR anyway + 22 page already has text! - rasterizing text and running OCR anyway + 23 page already has text! - rasterizing text and running OCR anyway + 24 page already has text! - rasterizing text and running OCR anyway + 25 page already has text! 
- rasterizing text and running OCR anyway + 26 page already has text! - rasterizing text and running OCR anyway + 27 page already has text! - rasterizing text and running OCR anyway + 28 page already has text! - rasterizing text and running OCR anyway + 29 page already has text! - rasterizing text and running OCR anyway + 30 page already has text! - rasterizing text and running OCR anyway + 31 page already has text! - rasterizing text and running OCR anyway + 20 [tesseract] lots of diacritics - possibly poor OCR + 32 page already has text! - rasterizing text and running OCR anyway + 33 page already has text! - rasterizing text and running OCR anyway + 34 page already has text! - rasterizing text and running OCR anyway + 35 page already has text! - rasterizing text and running OCR anyway + 36 page already has text! - rasterizing text and running OCR anyway + 26 [tesseract] lots of diacritics - possibly poor OCR + 37 page already has text! - rasterizing text and running OCR anyway + 31 [tesseract] lots of diacritics - possibly poor OCR + 38 page already has text! - rasterizing text and running OCR anyway + +Postprocessing... +Some input metadata could not be copied because it is not permitted in PDF/A. You may wish to examine the output PDF's XMP metadata. + + + + +Image optimization ratio: 1.64 savings: 39.0% +Total file size ratio: 0.59 savings: -68.4% +Output file is a PDF/A-2B (as expected) +The output file size is 1.68× larger than the input file. +Possible reasons for this include: +--force-ocr was issued, causing transcoding. +The optional dependency 'jbig2' was not found, so some image optimizations could not be attempted. +The optional dependency 'pngquant' was not found, so some image optimizations could not be attempted. +PDF/A conversion was enabled. (Try `--output-type pdf`.) 
+ diff --git a/prototypes/ocr_demo/logs/Pitchbook 2.log b/prototypes/ocr_demo/logs/Pitchbook 2.log new file mode 100644 index 0000000..1bc8e24 --- /dev/null +++ b/prototypes/ocr_demo/logs/Pitchbook 2.log @@ -0,0 +1,17 @@ + +Start processing 10 pages concurrently + 12 [tesseract] lots of diacritics - possibly poor OCR + 15 [tesseract] lots of diacritics - possibly poor OCR + 37 [tesseract] lots of diacritics - possibly poor OCR + 47 [tesseract] lots of diacritics - possibly poor OCR + 49 [tesseract] lots of diacritics - possibly poor OCR + +Postprocessing... +Some input metadata could not be copied because it is not permitted in PDF/A. You may wish to examine the output PDF's XMP metadata. + + + + +Image optimization ratio: 1.26 savings: 20.7% +Total file size ratio: 0.91 savings: -9.6% +Output file is a PDF/A-2B (as expected) diff --git a/prototypes/ocr_demo/logs/Pitchbook 3.log b/prototypes/ocr_demo/logs/Pitchbook 3.log new file mode 100644 index 0000000..ca14c18 --- /dev/null +++ b/prototypes/ocr_demo/logs/Pitchbook 3.log @@ -0,0 +1,13 @@ + +Start processing 10 pages concurrently + 21 [tesseract] lots of diacritics - possibly poor OCR + +Postprocessing... +Some input metadata could not be copied because it is not permitted in PDF/A. You may wish to examine the output PDF's XMP metadata. + + + + +Image optimization ratio: 1.21 savings: 17.6% +Total file size ratio: 0.97 savings: -3.3% +Output file is a PDF/A-2B (as expected) diff --git a/prototypes/ocr_demo/logs/Teaser 1 FINAL.log b/prototypes/ocr_demo/logs/Teaser 1 FINAL.log new file mode 100644 index 0000000..79bbf58 --- /dev/null +++ b/prototypes/ocr_demo/logs/Teaser 1 FINAL.log @@ -0,0 +1,53 @@ + +Start processing 10 pages concurrently + 1 page already has text! - rasterizing text and running OCR anyway + 2 page already has text! - rasterizing text and running OCR anyway + 3 page already has text! - rasterizing text and running OCR anyway + 4 page already has text! 
- rasterizing text and running OCR anyway + 5 page already has text! - rasterizing text and running OCR anyway + 6 page already has text! - rasterizing text and running OCR anyway + 7 page already has text! - rasterizing text and running OCR anyway + 8 page already has text! - rasterizing text and running OCR anyway + 9 page already has text! - rasterizing text and running OCR anyway + 10 page already has text! - rasterizing text and running OCR anyway + 11 page already has text! - rasterizing text and running OCR anyway + 12 page already has text! - rasterizing text and running OCR anyway + 13 page already has text! - rasterizing text and running OCR anyway + 14 page already has text! - rasterizing text and running OCR anyway + 15 page already has text! - rasterizing text and running OCR anyway + 16 page already has text! - rasterizing text and running OCR anyway + 17 page already has text! - rasterizing text and running OCR anyway + 18 page already has text! - rasterizing text and running OCR anyway + 19 page already has text! - rasterizing text and running OCR anyway + 20 page already has text! - rasterizing text and running OCR anyway + 1 [tesseract] lots of diacritics - possibly poor OCR + 21 page already has text! - rasterizing text and running OCR anyway + 22 page already has text! - rasterizing text and running OCR anyway + 23 page already has text! - rasterizing text and running OCR anyway + 24 page already has text! - rasterizing text and running OCR anyway + 25 page already has text! - rasterizing text and running OCR anyway + 26 page already has text! - rasterizing text and running OCR anyway + 27 page already has text! - rasterizing text and running OCR anyway + 28 page already has text! - rasterizing text and running OCR anyway + 29 page already has text! - rasterizing text and running OCR anyway + 30 page already has text! - rasterizing text and running OCR anyway + 31 page already has text! 
- rasterizing text and running OCR anyway + 32 page already has text! - rasterizing text and running OCR anyway + 26 [tesseract] lots of diacritics - possibly poor OCR + 33 page already has text! - rasterizing text and running OCR anyway + 21 [tesseract] lots of diacritics - possibly poor OCR + 35 page already has text! - rasterizing text and running OCR anyway + 36 page already has text! - rasterizing text and running OCR anyway + 37 page already has text! - rasterizing text and running OCR anyway + 38 page already has text! - rasterizing text and running OCR anyway + 38 [tesseract] lots of diacritics - possibly poor OCR + +Postprocessing... +Some input metadata could not be copied because it is not permitted in PDF/A. You may wish to examine the output PDF's XMP metadata. + + + + +Image optimization ratio: 1.62 savings: 38.4% +Total file size ratio: 0.83 savings: -20.3% +Output file is a PDF/A-2B (as expected) diff --git a/prototypes/ocr_demo/logs/Teaser 2 FINAL.log b/prototypes/ocr_demo/logs/Teaser 2 FINAL.log new file mode 100644 index 0000000..4309d31 --- /dev/null +++ b/prototypes/ocr_demo/logs/Teaser 2 FINAL.log @@ -0,0 +1,48 @@ + +Start processing 10 pages concurrently + 2 page already has text! - rasterizing text and running OCR anyway + 3 page already has text! - rasterizing text and running OCR anyway + 4 page already has text! - rasterizing text and running OCR anyway + 5 page already has text! - rasterizing text and running OCR anyway + 6 page already has text! - rasterizing text and running OCR anyway + 7 page already has text! - rasterizing text and running OCR anyway + 8 page already has text! - rasterizing text and running OCR anyway + 9 page already has text! - rasterizing text and running OCR anyway + 10 page already has text! - rasterizing text and running OCR anyway + 11 page already has text! - rasterizing text and running OCR anyway + 12 page already has text! - rasterizing text and running OCR anyway + 13 page already has text! 
- rasterizing text and running OCR anyway + 14 page already has text! - rasterizing text and running OCR anyway + 15 page already has text! - rasterizing text and running OCR anyway + 16 page already has text! - rasterizing text and running OCR anyway + 17 page already has text! - rasterizing text and running OCR anyway + 18 page already has text! - rasterizing text and running OCR anyway + 19 page already has text! - rasterizing text and running OCR anyway + 20 page already has text! - rasterizing text and running OCR anyway + 21 page already has text! - rasterizing text and running OCR anyway + 22 page already has text! - rasterizing text and running OCR anyway + 23 page already has text! - rasterizing text and running OCR anyway + 24 page already has text! - rasterizing text and running OCR anyway + 25 page already has text! - rasterizing text and running OCR anyway + 26 page already has text! - rasterizing text and running OCR anyway + 27 page already has text! - rasterizing text and running OCR anyway + 28 page already has text! - rasterizing text and running OCR anyway + 29 page already has text! - rasterizing text and running OCR anyway + 30 page already has text! - rasterizing text and running OCR anyway + 31 page already has text! - rasterizing text and running OCR anyway + 32 page already has text! - rasterizing text and running OCR anyway + 33 page already has text! - rasterizing text and running OCR anyway + 34 page already has text! - rasterizing text and running OCR anyway + 35 page already has text! - rasterizing text and running OCR anyway + 36 page already has text! - rasterizing text and running OCR anyway + 37 page already has text! - rasterizing text and running OCR anyway + +Postprocessing... +Some input metadata could not be copied because it is not permitted in PDF/A. You may wish to examine the output PDF's XMP metadata. 
+ + + + +Image optimization ratio: 1.54 savings: 35.2% +Total file size ratio: 0.89 savings: -12.2% +Output file is a PDF/A-2B (as expected) diff --git a/prototypes/ocr_demo/logs/Teaser 3 FINAL.log b/prototypes/ocr_demo/logs/Teaser 3 FINAL.log new file mode 100644 index 0000000..c4c7977 --- /dev/null +++ b/prototypes/ocr_demo/logs/Teaser 3 FINAL.log @@ -0,0 +1,12 @@ + +Start processing 10 pages concurrently + +Postprocessing... +Some input metadata could not be copied because it is not permitted in PDF/A. You may wish to examine the output PDF's XMP metadata. + + + + +Image optimization ratio: 2.13 savings: 53.0% +Total file size ratio: 2.83 savings: 64.7% +Output file is a PDF/A-2B (as expected) diff --git a/prototypes/ocr_demo/logs/Teaser 4 FINAL.log b/prototypes/ocr_demo/logs/Teaser 4 FINAL.log new file mode 100644 index 0000000..ced6b1b --- /dev/null +++ b/prototypes/ocr_demo/logs/Teaser 4 FINAL.log @@ -0,0 +1,14 @@ + +Start processing 10 pages concurrently + 27 [tesseract] lots of diacritics - possibly poor OCR + 45 [tesseract] lots of diacritics - possibly poor OCR + +Postprocessing... +Some input metadata could not be copied because it is not permitted in PDF/A. You may wish to examine the output PDF's XMP metadata. + + + + +Image optimization ratio: 1.68 savings: 40.3% +Total file size ratio: 2.30 savings: 56.5% +Output file is a PDF/A-2B (as expected) diff --git a/prototypes/ocr_demo/logs/Teaser 5 FINAL.log b/prototypes/ocr_demo/logs/Teaser 5 FINAL.log new file mode 100644 index 0000000..f3f5321 --- /dev/null +++ b/prototypes/ocr_demo/logs/Teaser 5 FINAL.log @@ -0,0 +1,12 @@ + +Start processing 10 pages concurrently + +Postprocessing... +Some input metadata could not be copied because it is not permitted in PDF/A. You may wish to examine the output PDF's XMP metadata. 
+
+
+
+
+Image optimization ratio: 2.03 savings: 50.7%
+Total file size ratio: 3.15 savings: 68.2%
+Output file is a PDF/A-2B (as expected)
diff --git a/prototypes/ocr_demo/ocr.py b/prototypes/ocr_demo/ocr.py
new file mode 100644
index 0000000..436bcf7
--- /dev/null
+++ b/prototypes/ocr_demo/ocr.py
@@ -0,0 +1,88 @@
+"""Batch-OCR every PDF in the pitch-books folder with the ocrmypdf CLI.
+
+For each input PDF this writes output/<stem>-OCR.pdf and stores the tool's
+console output in logs/<stem>.log, both next to this script.
+"""
+import os
+import subprocess
+import sys
+from pathlib import Path
+
+# Resolve folders relative to this file rather than the current working
+# directory, so the script works no matter where it is launched from.
+base_dir = Path(__file__).resolve().parent
+input_folder = base_dir.parents[1] / "pitch-books"
+output_folder = base_dir / "output"
+log_folder = base_dir / "logs"
+
+for folder in [output_folder, log_folder]:
+    folder.mkdir(parents=True, exist_ok=True)
+
+
+def ocr_pdf(input_file: Path) -> bool:
+    """Run ocrmypdf on one PDF and print the outcome.
+
+    Args:
+        input_file: Path of the PDF to process.
+
+    Returns:
+        True if ocrmypdf exited with status 0; False if it exited non-zero
+        or could not be started.
+    """
+    output_file = output_folder / f"{input_file.stem}-OCR.pdf"
+    log_file = log_folder / f"{input_file.stem}.log"
+
+    cmd = [
+        "ocrmypdf",
+        # Forces OCR even on pages that already have text; the logs report
+        # this as "rasterizing text and running OCR anyway".
+        "--force-ocr",
+        "--output-type", "pdfa",
+        "--language", "deu+eng",
+        str(input_file),
+        str(output_file),
+    ]
+
+    # The child writes directly to the log's file descriptor, so open it in
+    # binary mode; stderr is merged into the same stream as stdout.
+    with open(log_file, "wb") as log:
+        try:
+            result = subprocess.run(cmd, stdout=log, stderr=subprocess.STDOUT)
+        except OSError as exc:
+            # Typically: the ocrmypdf executable is not installed or not on PATH.
+            print(f"❌ OCR failed. Could not start ocrmypdf: {exc}")
+            return False
+
+    if result.returncode == 0:
+        print(f"✅ OCR complete: {output_file.name}")
+        return True
+    print(f"❌ OCR failed. See log: {log_file}")
+    return False
+
+
+def main() -> int:
+    """OCR every PDF in the input folder and return a process exit code."""
+    if not input_folder.is_dir():
+        print("Input folder does not exist!")
+        return 1
+
+    # Match the extension case-insensitively and sort for a stable order.
+    pdfs = sorted(
+        p for p in input_folder.iterdir()
+        if p.is_file() and p.suffix.lower() == ".pdf"
+    )
+    if not pdfs:
+        print("No PDFs found in input folder.")
+        return 0
+
+    # Keep going after a failure so one bad PDF does not block the rest.
+    failed = 0
+    for pdf in pdfs:
+        print(f"Processing: {pdf.name}")
+        if not ocr_pdf(pdf):
+            failed += 1
+    return 1 if failed else 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/prototypes/ocr_demo/output/Pitchbook 1-OCR.pdf b/prototypes/ocr_demo/output/Pitchbook 1-OCR.pdf
new file mode 100644
index 0000000..09609d8
Binary files /dev/null and b/prototypes/ocr_demo/output/Pitchbook 1-OCR.pdf differ
diff --git a/prototypes/ocr_demo/output/Pitchbook 2-OCR.pdf b/prototypes/ocr_demo/output/Pitchbook 2-OCR.pdf
new file mode 100644
index 0000000..12d5b9d
Binary files /dev/null and b/prototypes/ocr_demo/output/Pitchbook 2-OCR.pdf differ
diff --git a/prototypes/ocr_demo/output/Pitchbook 3-OCR.pdf b/prototypes/ocr_demo/output/Pitchbook 3-OCR.pdf
new file mode 100644
index 0000000..6d572f3
Binary files /dev/null and b/prototypes/ocr_demo/output/Pitchbook 3-OCR.pdf differ
diff --git a/prototypes/ocr_demo/output/Teaser 1 FINAL-OCR.pdf b/prototypes/ocr_demo/output/Teaser 1 FINAL-OCR.pdf
new file mode 100644
index 0000000..85d52a0
Binary files /dev/null and b/prototypes/ocr_demo/output/Teaser 1 FINAL-OCR.pdf differ
diff --git a/prototypes/ocr_demo/output/Teaser 2 FINAL-OCR.pdf b/prototypes/ocr_demo/output/Teaser 2 FINAL-OCR.pdf
new file mode 100644
index 0000000..32f808f
Binary files /dev/null and b/prototypes/ocr_demo/output/Teaser 2 FINAL-OCR.pdf differ
diff --git a/prototypes/ocr_demo/output/Teaser 3 FINAL-OCR.pdf b/prototypes/ocr_demo/output/Teaser 3 FINAL-OCR.pdf
new file mode 100644
index 0000000..00f6037
Binary files /dev/null and b/prototypes/ocr_demo/output/Teaser 3 FINAL-OCR.pdf differ
diff --git a/prototypes/ocr_demo/output/Teaser 4 FINAL-OCR.pdf b/prototypes/ocr_demo/output/Teaser 4 FINAL-OCR.pdf
new file mode 100644
index 0000000..6aef376
Binary files /dev/null and
b/prototypes/ocr_demo/output/Teaser 4 FINAL-OCR.pdf differ diff --git a/prototypes/ocr_demo/output/Teaser 5 FINAL-OCR.pdf b/prototypes/ocr_demo/output/Teaser 5 FINAL-OCR.pdf new file mode 100644 index 0000000..67a9dbc Binary files /dev/null and b/prototypes/ocr_demo/output/Teaser 5 FINAL-OCR.pdf differ diff --git a/prototypes/ocr_demo/requirements.txt b/prototypes/ocr_demo/requirements.txt new file mode 100644 index 0000000..9b84fc4 --- /dev/null +++ b/prototypes/ocr_demo/requirements.txt @@ -0,0 +1,3 @@ +ocrmypdf +pdfplumber +PyMuPDF \ No newline at end of file