diff --git a/pom.xml b/pom.xml index 52adb1c..0fb13ab 100644 --- a/pom.xml +++ b/pom.xml @@ -14,6 +14,26 @@ pdfbox 2.0.29 + + org.apache.poi + poi-ooxml + 5.2.3 + + + org.apache.poi + poi + 5.2.3 + + + org.apache.xmlbeans + xmlbeans + 5.1.1 + + + org.apache.logging.log4j + log4j-core + 2.18.0 + diff --git a/src/main/java/domain/FileLoader.java b/src/main/java/domain/FileLoader.java index d98f24c..9b0b100 100644 --- a/src/main/java/domain/FileLoader.java +++ b/src/main/java/domain/FileLoader.java @@ -14,14 +14,14 @@ public class FileLoader { public FileLoader() { this.inputFile = null; } - + //KI erstellte Methode public File loadFileGUI() { try { JFileChooser fileChooser = new JFileChooser(); fileChooser.addChoosableFileFilter(new FileNameExtensionFilter("PDF Files", "pdf")); - fileChooser.addChoosableFileFilter(new FileNameExtensionFilter("Text Files", "txt")); - fileChooser.addChoosableFileFilter(new FileNameExtensionFilter("Word Documents", "docx")); - fileChooser.addChoosableFileFilter(new FileNameExtensionFilter("PowerPoint Presentations", "pptx")); + fileChooser.addChoosableFileFilter(new FileNameExtensionFilter("Text Files", "txt")); //selbst hinzugefügt + fileChooser.addChoosableFileFilter(new FileNameExtensionFilter("Word Documents", "docx")); //selbst hinzugefügt + fileChooser.addChoosableFileFilter(new FileNameExtensionFilter("PowerPoint Presentations", "pptx")); //selbst hinzugefügt int result = fileChooser.showOpenDialog(null); if (result == JFileChooser.APPROVE_OPTION) { diff --git a/src/main/java/domain/TextProcessing.java b/src/main/java/domain/TextProcessing.java index ea4b6d5..c6b2e20 100644 --- a/src/main/java/domain/TextProcessing.java +++ b/src/main/java/domain/TextProcessing.java @@ -2,6 +2,12 @@ package domain; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.text.PDFTextStripper; +import org.apache.poi.xwpf.usermodel.XWPFDocument; +import org.apache.poi.xwpf.usermodel.XWPFParagraph; +import org.apache.poi.xslf.usermodel.XMLSlideShow; +import org.apache.poi.xslf.usermodel.XSLFSlide; +import org.apache.poi.xslf.usermodel.XSLFShape; +import org.apache.poi.xslf.usermodel.XSLFTextShape; import java.io.*; import java.util.HashMap; @@ -9,31 +15,44 @@ import java.util.HashMap; public class TextProcessing { public String formatToText(File file, String format) { - PDDocument document; try { + StringBuilder text = new StringBuilder(); if (file != null) { switch (format) { case "txt": - - break; - + FileReader fileReader = new FileReader(file); + BufferedReader reader = new BufferedReader(fileReader); + String line; + while((line = reader.readLine()) != null) { + text.append(line).append("\n"); + } + return text.toString(); case "pdf": - document = PDDocument.load(file); + PDDocument document = PDDocument.load(file); PDFTextStripper pdfStripper = new PDFTextStripper(); return pdfStripper.getText(document); case "docx": - - break; - + XWPFDocument officeDocument = new XWPFDocument(new FileInputStream(file)); + for(XWPFParagraph paragraph : officeDocument.getParagraphs()) { + text.append(paragraph.getText()).append("\n"); + } + return text.toString(); case "pptx": - - break; + XMLSlideShow ppt = new XMLSlideShow(new FileInputStream(file)); + for (XSLFSlide slide : ppt.getSlides()) { + for (XSLFShape shape : slide.getShapes()) { + if (shape instanceof XSLFTextShape) { + text.append(((XSLFTextShape) shape).getText()).append("\n"); + } + } + } + return text.toString(); } } } - catch (IOException ex) { - throw new RuntimeException(ex); + catch (IOException e) { + throw new RuntimeException(e); } return "Nothing found!"; } @@ -53,18 +72,20 @@ public class TextProcessing { public HashMap tokenizingText(String text){ HashMap filteredWords = new HashMap<>(); try { + if(!text.isEmpty()) { //Tokenizing der Wörter - String splitter = "[,\\s\\.:/!§$%&/()=?+*~#.;_<>^°\"']"; + String splitter = "[,\\s\\.:/!§$%&/()=?+*~#.;_<\\-–>^°\"']"; String[] textWords = text.split(splitter); - for(String word : textWords){ + for (String word : textWords) { if (filteredWords.containsKey(word)) { filteredWords.compute(word, (k, counter) -> counter + 1); - } - else { + } else { filteredWords.put(word, 1); } } - } catch (Exception ex) { + } + } + catch (Exception ex) { throw new RuntimeException(ex); } return filteredWords;