From 583c4aac0b5e2f606ba4dc5c5712e6e53a98ec04 Mon Sep 17 00:00:00 2001
From: Emmanuel Di Pretoro <edipretoro@gmail.com>
Date: Fri, 24 Aug 2018 11:20:54 +0200
Subject: [PATCH] Adding the text extraction of the PDF files found in data

---
 Makefile     | 4 ++++
 doc/help.txt | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/Makefile b/Makefile
index 89f5e70..109435a 100644
--- a/Makefile
+++ b/Makefile
@@ -1,2 +1,6 @@
 help:
 	@cat doc/help.txt
+
+pdftotext:  ## Convert every PDF under ./data to text, then gather the .txt files in ./data/text/
+	@find ./data -iname '*.pdf' -execdir pdftotext {} \;
+	@mkdir -p ./data/text && find ./data -not \( -path ./data/text -prune \) -iname '*.txt' -exec mv {} './data/text/' ';'
diff --git a/doc/help.txt b/doc/help.txt
index 508455a..6056996 100644
--- a/doc/help.txt
+++ b/doc/help.txt
@@ -1,2 +1,6 @@
 With this command, you'll be able to manage easily the extraction of
 URLs from books scanned by KBR
+
+Here is the list of commands and what they do:
+* make pdftotext: this command extracts a text version of the PDF files
+                  and moves these text files to the data/text/ directory