From 583c4aac0b5e2f606ba4dc5c5712e6e53a98ec04 Mon Sep 17 00:00:00 2001
From: Emmanuel Di Pretoro
Date: Fri, 24 Aug 2018 11:20:54 +0200
Subject: [PATCH] Adding the text extraction of the PDF files found in data

---
Reviewer notes (this area is ignored when the patch is applied):
* Line breaks were reconstructed from a whitespace-collapsed copy; the
  wrapping of the two doc/help.txt context lines is a best guess -- TODO
  confirm against the tree before applying.
* The post-image blob ids on the "index" lines were not recomputed after
  the review changes below.

 Makefile     | 10 ++++++++++
 doc/help.txt |  4 ++++
 2 files changed, 14 insertions(+)

diff --git a/Makefile b/Makefile
index 89f5e70..109435a 100644
--- a/Makefile
+++ b/Makefile
@@ -1,2 +1,12 @@
 help:
 	@cat doc/help.txt
+
+# Both targets are commands, not files to build.
+.PHONY: help pdftotext
+
+# Convert every PDF under ./data to text (pdftotext writes each .txt next to
+# its PDF), then move the .txt files found outside ./data/text into it.
+pdftotext:
+	@mkdir -p ./data/text
+	@find ./data -iname '*.pdf' -execdir pdftotext {} \;
+	@find ./data -not \( -path ./data/text -prune \) -iname '*.txt' -exec mv {} './data/text/' ';'
diff --git a/doc/help.txt b/doc/help.txt
index 508455a..6056996 100644
--- a/doc/help.txt
+++ b/doc/help.txt
@@ -1,2 +1,6 @@
 With this command, you'll be able to manage easily the extraction of URLs
 from books scanned by KBR
+
+Here is the list of commands and what they do:
+* make pdftotext: this command extracts a text version of the PDF files
+  found in data/ and moves these text files to the data/text/ directory