From 0d09a825e9d1d5b4dd05a19a7d81da7897036fa2 Mon Sep 17 00:00:00 2001 From: Jaronim Pracht Date: Wed, 25 Jun 2025 15:36:01 +0200 Subject: [PATCH 1/7] add validiation in service --- .../validate-service/validate_logic.py | 108 ++++++++++++++++-- 1 file changed, 100 insertions(+), 8 deletions(-) diff --git a/project/backend/validate-service/validate_logic.py b/project/backend/validate-service/validate_logic.py index 1bbacbe..1e90d84 100644 --- a/project/backend/validate-service/validate_logic.py +++ b/project/backend/validate-service/validate_logic.py @@ -1,15 +1,36 @@ from typing import Dict, List +import re +import requests +import os - +# SETTINGS = [{"id": "Rendite", "type": "number"}] +COORDINATOR_URL = os.getenv("COORDINATOR_URL", "http://localhost:5000") def validate_entities(entities): + try: + response = requests.get(COORDINATOR_URL + "/api/kpi_setting/") + if response.status_code == 200: + settings = response.json() + else: + settings = [] + except requests.exceptions.RequestException as e: + print(f"Error fetching settings: {e}") + settings = [] + # settings = SETTINGS + result = [] - reduced_kpi: Dict[str, List[Dict[str, str | int]]] = {} + reduced_kpi: Dict[str, List[Dict[str, str]]] = {} + + # reduce entities by label. Example: {"PERSON": [{"label": "PERSON", "entity": "John Doe", "status": "validated"}]} for item in entities: label = item["label"] if label not in reduced_kpi: reduced_kpi[label] = [] reduced_kpi[label].append(item) + reduced_kpi = delete_exxeta_unknown(reduced_kpi) + reduced_kpi = validate_number(reduced_kpi, settings) + reduced_kpi = delete_duplicate_entities(reduced_kpi) + for item in reduced_kpi.items(): if item[0] == "FONDSNAME": result.extend(item[1]) @@ -21,6 +42,8 @@ def validate_entities(entities): result.extend(item[1]) continue + + # Filter not validated, if there are valid values validated = False for entity in item[1]: if entity["status"] == "validated": @@ -34,13 +57,82 @@ def validate_entities(entities): return result + +def validate_number(entity_list, settings): + filtered_kpi = {} + for label, entity_list in entity_list.items(): + + setting = next((s for s in settings if s["name"].upper() == label), None) + if setting and setting["type"] == "number": + filtered_entities = [ + entity for entity in entity_list + if is_valid_number(str(entity["entity"])) + ] + for entity in entity_list: + if not is_valid_number(str(entity["entity"])): + print(f"Invalid number: {entity}") + if filtered_entities: # Only add the label if there are entities left + filtered_kpi[label] = filtered_entities + else: + filtered_kpi[label] = entity_list + + return filtered_kpi + + +def is_valid_number(number): + pattern = r'^[0-9\-\s%,.€]+$' + return any(char.isdigit() for char in number) and not re.search(r'\d+\s\d+', number) and re.fullmatch(pattern, number) + + +def delete_exxeta_unknown(entity_list): + filtered_kpi = {} + for label, entity_list in entity_list.items(): + # Filter out entities with "nichtangegeben" or "n/a" (case-insensitive and stripped) + filtered_entities = [ + entity for entity in entity_list + if str(entity["entity"]).lower().replace(" ", "") not in {"nichtangegeben", "n/a"} + ] + for entity in entity_list: + if str(entity["entity"]).lower().replace(" ", "") in {"nichtangegeben", "n/a"}: + print(f"filtered out: {entity}") + if filtered_entities: # Only add the label if there are entities left + filtered_kpi[label] = filtered_entities + return filtered_kpi + + +def delete_duplicate_entities(entity_list): + unique_entities = {} + for label, entity_list in entity_list.items(): + values = set() + filtered_entities = [] + for entity in entity_list: + if str(entity["entity"]).lower().replace(" ", "") not in values: + filtered_entities.append(entity) + else: + print(f"Duplicate entity: {entity}") + values.add(str(entity["entity"]).lower().replace(" ", "")) + if filtered_entities: + unique_entities[label] = filtered_entities + return unique_entities + if __name__ == "__main__": entities = [ - {"label": "PERSON", "entity": "John Doe", "status": "validated"}, - {"label": "PERSON", "entity": "Exxeta", "status": "invalid"}, - {"label": "ORG", "entity": "Google", "status": "invalid"}, - {"label": "FONDSNAME", "entity": "Microsoft", "status": "validated"}, - {"label": "FONDSNAME", "entity": "Amazon", "status": "invalid"}, - {"label": "FONDSNAME", "entity": "Apple", "status": "invalid"} + # {"label": "PERSON", "entity": "John Doe", "status": "validated"}, + # {"label": "PERSON", "entity": "Exxeta", "status": "invalid"}, + # {"label": "ORG", "entity": "Google", "status": "invalid"}, + # {"label": "FONDSNAME", "entity": "Microsoft", "status": "validated"}, + # {"label": "FONDSNAME", "entity": "Amazon", "status": "invalid"}, + # {"label": "FONDSNAME", "entity": "Apple", "status": "invalid"}, + {"label": "RENDITE", "entity": "8 8 8 8 8", "status": "validated"}, + {"label": "RENDITE", "entity": "N/A", "status": "validated"}, + {"label": "RENDITE", "entity": "nicht angegeben", "status": "validated"}, + {"label": "RENDITE", "entity": "uaieluae--t>", "status": "validated"}, + {"label": "RENDITE", "entity": "3,5", "status": "validated"}, + {"label": "RENDITE", "entity": "3,5", "status": "validated"}, + {"label": "RENDITE", "entity": "3 , 5", "status": "validated"}, + {"label": "RENDITE", "entity": "3%", "status": "validated"}, + {"label": "RENDITE", "entity": "", "status": "invalid"}, + {"label": "RENDITE", "entity": "2 mehr als 6", "status": "invalid"}, + {"label": "RENDITE", "entity": 2, "status": "invalid"}, ] print(validate_entities(entities)) From 360d7f490694b734689317025c6337c13b86c049 Mon Sep 17 00:00:00 2001 From: Abdulraahman Dabbagh <1924466@stud.hs-mannheim.de> Date: Wed, 25 Jun 2025 19:28:19 +0200 Subject: [PATCH 2/7] Dynamischer Prompt mit API-KPIs, Fortschritt integriert --- .../exxetaGPT-service/extractExxeta.py | 70 +++++++++---------- 1 file changed, 33 insertions(+), 37 deletions(-) diff --git a/project/backend/exxetaGPT-service/extractExxeta.py b/project/backend/exxetaGPT-service/extractExxeta.py index c554f0c..a6f922d 100644 --- a/project/backend/exxetaGPT-service/extractExxeta.py +++ b/project/backend/exxetaGPT-service/extractExxeta.py @@ -1,7 +1,6 @@ import requests import json import os -import time import logging from dotenv import load_dotenv @@ -17,6 +16,18 @@ TIMEOUT = 180 logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) +def get_dynamic_labels(): + url = f"{COORDINATOR_URL}/api/kpi_setting/" + try: + response = requests.get(url, timeout=10) + response.raise_for_status() + kpi_list = response.json() + labels = [kpi["name"].upper() for kpi in kpi_list if kpi.get("active", False)] + return labels + except Exception as e: + logger.warning(f"Konnte dynamische Labels nicht laden: {e}") + return [] + def extract_with_exxeta(pages_json, pitchbook_id): results = [] @@ -30,9 +41,7 @@ def extract_with_exxeta(pages_json, pitchbook_id): if i % 8 == 0: requests.post(COORDINATOR_URL + "/api/progress", json={"id": pitchbook_id, "progress": 35 + 60/len(pages_json)*i}) - page_num = page_data.get("page") - page_data.get("page") text = page_data.get("text", "") if not text: @@ -51,9 +60,9 @@ def extract_with_exxeta(pages_json, pitchbook_id): "- Gib die Antwort als **JSON-Array** im folgenden Format zurück:\n\n" "[\n" " {\n" - " \"label\": \"FONDSNAME\",\n" - " \"entity\": \"...\",\n" - f" \"page\": {page_num},\n" + ' "label": "FONDSNAME",\n' + ' "entity": "...",\n' + f' "page": {page_num},\n' " },\n" " ...\n" "]\n\n" @@ -61,45 +70,29 @@ def extract_with_exxeta(pages_json, pitchbook_id): f"TEXT:\n{text}" ) else: + labels = get_dynamic_labels() + prompt_kennzahlen = "".join([f"- {label}\n" for label in labels]) prompt = ( "Bitte extrahiere relevante Fondskennzahlen aus dem folgenden Pitchbook-Text. " "Analysiere den Text sorgfältig, um **nur exakt benannte und relevante Werte** zu extrahieren.\n\n" - "ZU EXTRAHIERENDE KENNZAHLEN (immer exakt wie unten angegeben):\n" - "- FONDSNAME\n" - "- FONDSMANAGER\n" - "- AIFM (z. B. Name Kapitalverwaltungsgesellschaft)\n" - "- DATUM\n" - "- RISIKOPROFIL (z. B. CORE, CORE+, VALUE-ADDED, OPPORTUNISTISCH)\n" - "- ARTIKEL (z. B. ARTIKEL 6, 8, 9)\n" - "- ZIELRENDITE\n" - "- RENDITE\n" - "- ZIELAUSSCHÜTTUNG\n" - "- AUSSCHÜTTUNG\n" - "- LAUFZEIT\n" - "- LTV\n" - "- MANAGEMENTGEBÜHREN (ggf. mit Staffelung und Bezug auf NAV/GAV)\n" - "- SEKTORENALLOKATION (z. B. BÜRO, LOGISTIK, WOHNEN... inkl. %-Angaben)\n" - "- LÄNDERALLOKATION (z. B. DEUTSCHLAND, FRANKREICH, etc. inkl. %-Angaben)\n\n" - + f"{prompt_kennzahlen}\n" "WICHTIG:\n" "- Gib **nur eine Entität pro Kennzahl** an - keine Listen oder Interpretationen.\n" - "- Wenn mehrere Varianten genannt werden (z. B. \"Core und Core+\"), gib sie im Originalformat als **eine entity** an.\n" + '- Wenn mehrere Varianten genannt werden (z. B. "Core und Core+"), gib sie im Originalformat als **eine entity** an.\n' "- **Keine Vermutungen oder Ergänzungen**. Wenn keine Information enthalten ist, gib die Kennzahl **nicht aus**.\n" "- Extrahiere **nur wörtlich vorkommende Inhalte** (keine Berechnungen, keine Zusammenfassungen).\n" "- Jeder gefundene Wert muss einem der obigen Label **eindeutig zuordenbar** sein.\n\n" - "FORMAT:\n" "Antworte als **reines JSON-Array** mit folgendem Format:\n" "[\n" " {\n" - " \"label\": \"Kennzahlname (exakt wie oben)\",\n" - " \"entity\": \"Wert aus dem Text (exakt im Original)\",\n" - f" \"page\": {page_num},\n" + ' "label": "Kennzahlname (exakt wie oben)",\n' + ' "entity": "Wert aus dem Text (exakt im Original)",\n' + f' "page": {page_num},\n' " },\n" " ...\n" "]\n\n" - f"Falls keine Kennzahl enthalten ist, gib ein leeres Array [] zurück.\n\n" f"Nur JSON-Antwort - keine Kommentare, keine Erklärungen, kein Text außerhalb des JSON.\n\n" f"TEXT:\n{text}" @@ -119,16 +112,17 @@ def extract_with_exxeta(pages_json, pitchbook_id): "temperature": 0.0 } + print("\n==== Dynamisch gebauter Prompt ====\n") + print(prompt) + print("\n===================================\n") + url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions" for attempt in range(1, MAX_RETRIES + 1): try: response = requests.post(url, headers=headers, json=payload, timeout=TIMEOUT) response.raise_for_status() - - content = response.json()["choices"][0]["message"]["content"] - content = content.strip() - + content = response.json()["choices"][0]["message"]["content"].strip() if content.startswith("```json"): content = content.split("```json")[1] if content.endswith("```"): @@ -143,14 +137,16 @@ def extract_with_exxeta(pages_json, pitchbook_id): if isinstance(page_results, list): results.extend(page_results) break - - except requests.exceptions.RequestException as e: + except requests.exceptions.RequestException: if attempt == MAX_RETRIES: results.extend([]) - except Exception as e: + except Exception: if attempt == MAX_RETRIES: results.extend([]) - requests.post(COORDINATOR_URL + "/api/progress", json={"id": pitchbook_id, "progress": 95}) return json.dumps(results, indent=2, ensure_ascii=False) + +if __name__ == "__main__": + print("📡 Test-Aufruf get_dynamic_labels:") + print(get_dynamic_labels()) \ No newline at end of file From dc9d693768a8e6a078d3fb2e4f18c0c6165b5713 Mon Sep 17 00:00:00 2001 From: Abdulrahman Dabbagh <1924466@stud.hs-mannheim.de> Date: Thu, 26 Jun 2025 03:48:44 +0200 Subject: [PATCH 3/7] project/backend/exxetaGPT-service/extractExxeta.py aktualisiert Prints entfernt --- project/backend/exxetaGPT-service/extractExxeta.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/project/backend/exxetaGPT-service/extractExxeta.py b/project/backend/exxetaGPT-service/extractExxeta.py index a6f922d..3948c8b 100644 --- a/project/backend/exxetaGPT-service/extractExxeta.py +++ b/project/backend/exxetaGPT-service/extractExxeta.py @@ -112,10 +112,6 @@ def extract_with_exxeta(pages_json, pitchbook_id): "temperature": 0.0 } - print("\n==== Dynamisch gebauter Prompt ====\n") - print(prompt) - print("\n===================================\n") - url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions" for attempt in range(1, MAX_RETRIES + 1): From 0a64411a5fc14a024a41e9890a165cda39602129 Mon Sep 17 00:00:00 2001 From: s8613 Date: Thu, 26 Jun 2025 20:22:19 +0200 Subject: [PATCH 4/7] Start for mandatory kpi in config --- project/frontend/src/components/ConfigTable.tsx | 3 +++ project/frontend/src/components/UploadPage.tsx | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/project/frontend/src/components/ConfigTable.tsx b/project/frontend/src/components/ConfigTable.tsx index f835021..2a9ae34 100644 --- a/project/frontend/src/components/ConfigTable.tsx +++ b/project/frontend/src/components/ConfigTable.tsx @@ -330,6 +330,9 @@ export function ConfigTable({ from }: ConfigTableProps) { > {kennzahl.name} + {kennzahl.mandatory && ( + * + )} diff --git a/project/frontend/src/components/UploadPage.tsx b/project/frontend/src/components/UploadPage.tsx index b54e7a3..1442ee6 100644 --- a/project/frontend/src/components/UploadPage.tsx +++ b/project/frontend/src/components/UploadPage.tsx @@ -121,7 +121,7 @@ export default function UploadPage() { fontWeight: "bold", color: "#383838", marginBottom: 12, - marginTop: 6, + marginTop: 3, }} > Pitchbook Extractor From 6fef07ac8650d7269cee11c342721530d96631f4 Mon Sep 17 00:00:00 2001 From: s8613 Date: Thu, 26 Jun 2025 20:42:43 +0200 Subject: [PATCH 5/7] Tooltip for added --- .../src/components/PitchBooksTable.tsx | 2 +- .../frontend/src/components/UploadPage.tsx | 2 +- .../src/routes/extractedResult.$pitchBook.tsx | 26 ++++++++++++++----- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/project/frontend/src/components/PitchBooksTable.tsx b/project/frontend/src/components/PitchBooksTable.tsx index bd7e5e7..8eaf63f 100644 --- a/project/frontend/src/components/PitchBooksTable.tsx +++ b/project/frontend/src/components/PitchBooksTable.tsx @@ -360,7 +360,7 @@ export function PitchBooksTable() { {status === "completed" ? ( } - label="Abgeschlossen" + label="Extraktion Abgeschlossen" size="small" sx={{ backgroundColor: "#e8f5e9", diff --git a/project/frontend/src/components/UploadPage.tsx b/project/frontend/src/components/UploadPage.tsx index 1442ee6..4c79be0 100644 --- a/project/frontend/src/components/UploadPage.tsx +++ b/project/frontend/src/components/UploadPage.tsx @@ -207,7 +207,7 @@ export default function UploadPage() { onMouseEnter={() => router.preloadRoute({ to: "/pitchbooks" })} onClick={() => navigate({ to: "/pitchbooks" })} > - Alle Pitch Books anzeigen + Alle Pitchbooks anzeigen diff --git a/project/frontend/src/routes/extractedResult.$pitchBook.tsx b/project/frontend/src/routes/extractedResult.$pitchBook.tsx index d579338..cc7c878 100644 --- a/project/frontend/src/routes/extractedResult.$pitchBook.tsx +++ b/project/frontend/src/routes/extractedResult.$pitchBook.tsx @@ -1,5 +1,5 @@ import ContentPasteIcon from "@mui/icons-material/ContentPaste"; -import { Box, Button, Paper, Typography, Snackbar, Alert, IconButton } from "@mui/material"; +import { Box, Button, Paper, Typography, Snackbar, Alert, IconButton, Tooltip } from "@mui/material"; import ArrowBackIcon from "@mui/icons-material/ArrowBack"; import { useSuspenseQuery } from "@tanstack/react-query"; import { createFileRoute, useNavigate } from "@tanstack/react-router"; @@ -235,17 +235,29 @@ function ExtractedResultsPage() { gap={2} sx={{ flexShrink: 0 }} > - + + Kennzahlen kopieren +
+ Kopiert alle aktiven Kennzahlen als Excel-Zeile in die Zwischenablage. Kann direkt in Excel eingefügt werden. + + } + placement="top" + arrow + > + +
From 019e10d5b8139c14026499a2282dee3e75547f76 Mon Sep 17 00:00:00 2001 From: s8613 Date: Thu, 26 Jun 2025 20:54:49 +0200 Subject: [PATCH 6/7] Fondsname added with dynamic changes. --- project/frontend/src/routes/extractedResult.$pitchBook.tsx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/project/frontend/src/routes/extractedResult.$pitchBook.tsx b/project/frontend/src/routes/extractedResult.$pitchBook.tsx index cc7c878..e9a3e79 100644 --- a/project/frontend/src/routes/extractedResult.$pitchBook.tsx +++ b/project/frontend/src/routes/extractedResult.$pitchBook.tsx @@ -50,6 +50,8 @@ function ExtractedResultsPage() { const { data: kpi } = useSuspenseQuery(kpiQueryOptions(pitchBook)); const { data: settings } = useSuspenseQuery(settingsQueryOptions()); + const fundName = kpi["FONDSNAME"]?.[0]?.entity; + const status = useMemo(() => { let hasRedBorders = false; let hasYellowBorders = false; @@ -158,7 +160,9 @@ function ExtractedResultsPage() { }} /> - Extrahierte Kennzahlen + + {fundName ? `Kennzahlen extrahiert aus: ${fundName}` : "Extrahierte Kennzahlen"} + Date: Thu, 26 Jun 2025 22:26:45 +0200 Subject: [PATCH 7/7] Adjust time zone offset for date formatting Add 2 hours to the hour component to account for the time zone difference in the application context --- project/frontend/src/util/date.ts | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/project/frontend/src/util/date.ts b/project/frontend/src/util/date.ts index 13dbfa8..763e47c 100644 --- a/project/frontend/src/util/date.ts +++ b/project/frontend/src/util/date.ts @@ -1,11 +1,11 @@ export const formatDate = (dateString: string): string => { - const date = new Date(dateString); + const date = new Date(dateString); - const hours = String(date.getHours()).padStart(2, '0'); - const minutes = String(date.getMinutes()).padStart(2, '0'); - const month = String(date.getMonth() + 1).padStart(2, '0'); // Months are zero-based - const day = String(date.getDate()).padStart(2, '0'); - const year = date.getFullYear(); + const hours = String(date.getHours() + 2).padStart(2, "0"); + const minutes = String(date.getMinutes()).padStart(2, "0"); + const month = String(date.getMonth() + 1).padStart(2, "0"); // Months are zero-based + const day = String(date.getDate()).padStart(2, "0"); + const year = date.getFullYear(); - return `${hours}:${minutes} ${day}.${month}.${year}`; + return `${hours}:${minutes} ${day}.${month}.${year}`; };