From 14b66a31b8a8b9bf18e1cd9746b699c6e2f3de1f Mon Sep 17 00:00:00 2001 From: s8613 Date: Sat, 26 Apr 2025 15:36:02 +0200 Subject: [PATCH] Added arc1 validation process spacy results and exxeta checks. and also the option that both check each other. --- prototypes/mcp_validate-arc1/config.py | 3 + .../mcp_spacy_validate_with_exxeta.py | 97 ++++++ .../mcp_spacy_validated_result.json | 200 +++++++++++ prototypes/mcp_validate-arc1/mcp_validate.py | 100 ++++++ .../mcp_validated_result.json | 314 ++++++++++++++++++ 5 files changed, 714 insertions(+) create mode 100644 prototypes/mcp_validate-arc1/config.py create mode 100644 prototypes/mcp_validate-arc1/mcp_spacy_validate_with_exxeta.py create mode 100644 prototypes/mcp_validate-arc1/mcp_spacy_validated_result.json create mode 100644 prototypes/mcp_validate-arc1/mcp_validate.py create mode 100644 prototypes/mcp_validate-arc1/mcp_validated_result.json diff --git a/prototypes/mcp_validate-arc1/config.py b/prototypes/mcp_validate-arc1/config.py new file mode 100644 index 0000000..3b27716 --- /dev/null +++ b/prototypes/mcp_validate-arc1/config.py @@ -0,0 +1,3 @@ +EXXETA_API_KEY = "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ0b2tlbiI6IjIzYzA0NGEzOWY5OWIxMjdmODA5ODA0YmMxZTczN2UyIn0.uOD9GhvFl1hqd2B3dyb0IOJ4x_o1IPcMckeQxh2KNj0" +EXXETA_BASE_URL = "https://ai.exxeta.com/api/v2/azure/openai" +MODEL_ID = "gpt-35-turbo" \ No newline at end of file diff --git a/prototypes/mcp_validate-arc1/mcp_spacy_validate_with_exxeta.py b/prototypes/mcp_validate-arc1/mcp_spacy_validate_with_exxeta.py new file mode 100644 index 0000000..ec97452 --- /dev/null +++ b/prototypes/mcp_validate-arc1/mcp_spacy_validate_with_exxeta.py @@ -0,0 +1,97 @@ +from config import EXXETA_API_KEY, EXXETA_BASE_URL +import requests +import json +from pathlib import Path + +MODEL = "gpt-35-turbo" + +SPACY_ENTITIES_PATH = "../fine_tuning_spaCy/entities_output.json" +OCR_PITCHBOOK_PATH = "../ocr/output/Pitchbook 1-OCR.json" +OUTPUT_PATH = "mcp_spacy_validated_result.json" + 
def load_spacy_entities():
    """Return the parsed JSON document stored at ``SPACY_ENTITIES_PATH``."""
    with open(SPACY_ENTITIES_PATH, "r", encoding="utf-8") as f:
        return json.load(f)


def load_pitchbook_pages():
    """Return the parsed JSON document stored at ``OCR_PITCHBOOK_PATH``."""
    with open(OCR_PITCHBOOK_PATH, "r", encoding="utf-8") as f:
        return json.load(f)


def get_page_text(pages, page_number):
    """Return the "text" of the first page dict whose "page" equals *page_number*.

    Args:
        pages: Iterable of dicts carrying "page" and (optionally) "text" keys.
        page_number: Value compared against each dict's "page" entry.

    Returns:
        The page text, or "" when no page matches or the text is missing/None.
    """
    for page in pages:
        if page.get("page") == page_number:
            # A stored null must not leak into the prompt as the string "None".
            return page.get("text") or ""
    return ""


def normalize_entity(entity):
    """Collapse every whitespace run (newlines included) into a single space.

    Returns "" for ``None`` or an empty string, so entries without an
    "entity" value do not crash the pipeline.
    """
    if not entity:
        return ""
    # str.split() with no argument already splits on newlines and tabs.
    return " ".join(entity.split())


def _is_true_reply(content):
    """Interpret the model reply as a strict verdict.

    Only the first word counts, after removing surrounding quotes, markdown
    markers and trailing punctuation. A reply such as "false, not true"
    is therefore rejected instead of matching on the substring "true".
    """
    if not content:
        return False
    words = content.strip().lower().split()
    if not words:
        return False
    return words[0].strip("\"'`*.,;:!") == "true"


def validate_entity_with_exxeta(entity, page_num, text, timeout=30):
    """Ask the chat-completions endpoint whether *entity* occurs in *text*.

    Args:
        entity: Target phrase to look for.
        page_num: Page number; only used in the prompt and in the log message.
        text: OCR text of that page.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        True when the model's reply starts with "true"; False for any other
        reply and for every request/response failure (which is printed).
    """
    prompt = (
        f"Du bist ein Validator für extrahierte Begriffe aus OCR-Texten.\n\n"
        f"Ziel: Überprüfe, ob die folgende Ziel-Formulierung im angegebenen OCR-Text auf Seite {page_num} vorkommt.\n\n"
        f"Ziel-Formulierung:\n"
        f"\"{entity}\"\n\n"
        f"Validierungsregeln:\n"
        f"- Groß- und Kleinschreibung ignorieren.\n"
        f"- Zusätzliche oder fehlende Leerzeichen, Zeilenumbrüche, Kommas, Schrägstriche ('/') oder Wörter wie 'und'/'or' zwischen Begriffen sind erlaubt.\n"
        f"- Leichte OCR-Fehler sind zulässig (z.B. fehlende oder doppelte Buchstaben).\n"
        f"- Wenn die Begriffe zusammen im selben Kontext stehen, zählt das als Treffer.\n"
        f"- Antworte **ausschließlich** mit \"true\" (Treffer) oder \"false\" (kein Treffer).\n"
        f"- Keine Kommentare, keine Begründungen, nur 'true' oder 'false'.\n\n"
        f"OCR-Text auf Seite {page_num}:\n{text}"
    )

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {EXXETA_API_KEY}"
    }

    payload = {
        "model": MODEL,
        "messages": [
            {"role": "system", "content": "Du bist ein Validierungsassistent. Antwort nur mit true oder false."},
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.0
    }

    url = f"{EXXETA_BASE_URL}/deployments/{MODEL}/chat/completions"

    try:
        # Without a timeout a stalled connection would block the whole run.
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
        response.raise_for_status()
        content = response.json()["choices"][0]["message"]["content"]
    except (requests.RequestException, KeyError, IndexError, TypeError, ValueError) as e:
        # Best effort: a failed call counts as "not validated" rather than aborting.
        print(f"⚠️ Validation failed for '{entity}' on page {page_num}: {e}")
        return False
    return _is_true_reply(content)


def run():
    """Validate every loaded entity against its page text and write the results.

    Reads the entity list and the OCR pages, asks the model once per distinct
    (normalized entity, page) pair, and dumps one result dict per input entry
    to ``OUTPUT_PATH``.
    """
    spacy_entities = load_spacy_entities()
    pitchbook_pages = load_pitchbook_pages()

    # Identical (entity, page) pairs share one verdict: this avoids repeated
    # API calls and keeps duplicates from receiving contradictory results.
    verdicts = {}
    validated_results = []

    for entity_data in spacy_entities:
        raw_entity = entity_data.get("entity")
        page = entity_data.get("page")
        entity = normalize_entity(raw_entity)

        key = (entity, page)
        if key not in verdicts:
            page_text = get_page_text(pitchbook_pages, page)
            # Nothing to look for, or nothing to look in: no need to ask the model.
            verdicts[key] = bool(entity and page_text) and validate_entity_with_exxeta(
                entity, page, page_text
            )

        validated_results.append({
            "label": entity_data.get("label"),
            "entity": raw_entity,
            "page": page,
            "validated": verdicts[key]
        })

    with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
        json.dump(validated_results, f, indent=2, ensure_ascii=False)

    print(f"✅ Validation complete! Results saved to {OUTPUT_PATH}")
Results saved to {OUTPUT_PATH}") + +if __name__ == "__main__": + run() \ No newline at end of file diff --git a/prototypes/mcp_validate-arc1/mcp_spacy_validated_result.json b/prototypes/mcp_validate-arc1/mcp_spacy_validated_result.json new file mode 100644 index 0000000..05a8a65 --- /dev/null +++ b/prototypes/mcp_validate-arc1/mcp_spacy_validated_result.json @@ -0,0 +1,200 @@ +[ + { + "label": "RISIKOPROFIL", + "entity": "Core and Core+", + "page": 4, + "validated": true + }, + { + "label": "RISIKOPROFIL", + "entity": "core, core+, value-added", + "page": 7, + "validated": true + }, + { + "label": "RISIKOPROFIL", + "entity": "Core/Core+", + "page": 10, + "validated": false + }, + { + "label": "RISIKOPROFIL", + "entity": "core/core+", + "page": 10, + "validated": true + }, + { + "label": "RISIKOPROFIL", + "entity": "Core/Core+", + "page": 10, + "validated": false + }, + { + "label": "RISIKOPROFIL", + "entity": "UK, DE, BE, NL, LU,", + "page": 10, + "validated": false + }, + { + "label": "RISIKOPROFIL", + "entity": "Core / Core +", + "page": 12, + "validated": true + }, + { + "label": "RISIKOPROFIL", + "entity": "core\n/ core+", + "page": 12, + "validated": true + }, + { + "label": "RISIKOPROFIL", + "entity": "core", + "page": 12, + "validated": true + }, + { + "label": "RISIKOPROFIL", + "entity": "Term / core+", + "page": 12, + "validated": true + }, + { + "label": "RISIKOPROFIL", + "entity": "core/core+", + "page": 12, + "validated": true + }, + { + "label": "RISIKOPROFIL", + "entity": "6,4 6,4", + "page": 13, + "validated": false + }, + { + "label": "RISIKOPROFIL", + "entity": "Country /", + "page": 14, + "validated": true + }, + { + "label": "RISIKOPROFIL", + "entity": "Core\nCore\nCore\nCore\nCore\nCore\nCore\nCore", + "page": 14, + "validated": false + }, + { + "label": "RISIKOPROFIL", + "entity": "Country /", + "page": 15, + "validated": false + }, + { + "label": "RISIKOPROFIL", + "entity": "Core\nCore\nCore\nCore\nCore\nCore", + "page": 15, + "validated": 
false + }, + { + "label": "RISIKOPROFIL", + "entity": "countries, giving", + "page": 18, + "validated": true + }, + { + "label": "RISIKOPROFIL", + "entity": "core/core+", + "page": 20, + "validated": true + }, + { + "label": "RISIKOPROFIL", + "entity": "core/core+", + "page": 20, + "validated": true + }, + { + "label": "RISIKOPROFIL", + "entity": "D, and", + "page": 21, + "validated": true + }, + { + "label": "RISIKOPROFIL", + "entity": "UK, DE, BE, NL, LU,", + "page": 26, + "validated": false + }, + { + "label": "RISIKOPROFIL", + "entity": "core or", + "page": 27, + "validated": false + }, + { + "label": "RISIKOPROFIL", + "entity": "Core +", + "page": 27, + "validated": false + }, + { + "label": "RISIKOPROFIL", + "entity": "kgCO,e", + "page": 30, + "validated": false + }, + { + "label": "RISIKOPROFIL", + "entity": "C,", + "page": 32, + "validated": true + }, + { + "label": "RISIKOPROFIL", + "entity": "KfW, Dwp", + "page": 35, + "validated": true + }, + { + "label": "RISIKOPROFIL", + "entity": "Bank,", + "page": 35, + "validated": false + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 36, + "validated": true + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 36, + "validated": true + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 37, + "validated": true + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 37, + "validated": true + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 38, + "validated": false + }, + { + "label": "RISIKOPROFIL", + "entity": "Core", + "page": 38, + "validated": true + } +] \ No newline at end of file diff --git a/prototypes/mcp_validate-arc1/mcp_validate.py b/prototypes/mcp_validate-arc1/mcp_validate.py new file mode 100644 index 0000000..699e2ee --- /dev/null +++ b/prototypes/mcp_validate-arc1/mcp_validate.py @@ -0,0 +1,100 @@ +import json +from pathlib import Path + +KPI_SERVICE_MAP = { + "risikoprofil": ["spacy", "exxeta"], + # "fondsname": ["exxeta"], + # 
"fundvolume": ["spacy", "exxeta"], +} + +SPACY_ENTITIES_PATH = "../fine_tuning_spaCy/entities_output.json" +EXXETA_ENTITIES_PATH = "../merge_validate-arc2/exxeta_result.json" + +def load_spacy_entities(path): + with open(path, "r", encoding="utf-8") as f: + return json.load(f) + +def load_exxeta_entities(path): + with open(path, "r", encoding="utf-8") as f: + return json.load(f) + +def normalize(text): + if not text: + return "" + return text.strip().lower().replace(" ", "").replace("/", "/") + +def validate_kpi(kpi, spacy_entities, exxeta_entities): + results = [] + + spacy_kpi = [e for e in spacy_entities if e.get("label", "").lower() == kpi] + exxeta_kpi = [e for e in exxeta_entities if e.get("label", "").lower() == kpi] + + spacy_by_page = {} + for e in spacy_kpi: + spacy_by_page.setdefault(e["page"], []).append(e) + + exxeta_by_page = {} + for e in exxeta_kpi: + exxeta_by_page.setdefault(e["page"], []).append(e) + + all_pages = set(spacy_by_page.keys()).union(exxeta_by_page.keys()) + + for page in sorted(all_pages): + spacy_entries = spacy_by_page.get(page, []) + exxeta_entries = exxeta_by_page.get(page, []) + + for se in spacy_entries: + se_entity = normalize(se["entity"]) + + matched = False + for ee in exxeta_entries: + ee_entity = normalize(ee["entity"]) + if se_entity == ee_entity: + results.append({ + "kpi": kpi, + "entity": se["entity"], + "page": page, + "validation_status": "validated" + }) + matched = True + break + + if not matched: + results.append({ + "kpi": kpi, + "entity": se["entity"], + "page": page, + "validation_status": "spacy-only" + }) + + for ee in exxeta_entries: + ee_entity = normalize(ee["entity"]) + if not any(normalize(se["entity"]) == ee_entity for se in spacy_entries): + results.append({ + "kpi": kpi, + "entity": ee["entity"], + "page": page, + "validation_status": "exxeta-only" + }) + + return results + +def save_results(results, filename): + with open(filename, "w", encoding="utf-8") as f: + json.dump(results, f, indent=2, 
ensure_ascii=False) + +def run(): + spacy_entities = load_spacy_entities(SPACY_ENTITIES_PATH) + exxeta_entities = load_exxeta_entities(EXXETA_ENTITIES_PATH) + + all_results = [] + + for kpi, services in KPI_SERVICE_MAP.items(): + results = validate_kpi(kpi, spacy_entities, exxeta_entities) + all_results.extend(results) + + save_results(all_results, "mcp_validated_result.json") + print("✅ Validation complete! Output: mcp_validated_result.json") + +if __name__ == "__main__": + run() diff --git a/prototypes/mcp_validate-arc1/mcp_validated_result.json b/prototypes/mcp_validate-arc1/mcp_validated_result.json new file mode 100644 index 0000000..f022b51 --- /dev/null +++ b/prototypes/mcp_validate-arc1/mcp_validated_result.json @@ -0,0 +1,314 @@ +[ + { + "kpi": "risikoprofil", + "entity": "Core and Core+", + "page": 4, + "validation_status": "validated" + }, + { + "kpi": "risikoprofil", + "entity": "core, core+, value-added", + "page": 7, + "validation_status": "validated" + }, + { + "kpi": "risikoprofil", + "entity": "Core", + "page": 9, + "validation_status": "exxeta-only" + }, + { + "kpi": "risikoprofil", + "entity": "Core/Core+", + "page": 10, + "validation_status": "validated" + }, + { + "kpi": "risikoprofil", + "entity": "core/core+", + "page": 10, + "validation_status": "validated" + }, + { + "kpi": "risikoprofil", + "entity": "Core/Core+", + "page": 10, + "validation_status": "validated" + }, + { + "kpi": "risikoprofil", + "entity": "UK, DE, BE, NL, LU,", + "page": 10, + "validation_status": "spacy-only" + }, + { + "kpi": "risikoprofil", + "entity": "Core / Core +", + "page": 12, + "validation_status": "validated" + }, + { + "kpi": "risikoprofil", + "entity": "core\n/ core+", + "page": 12, + "validation_status": "spacy-only" + }, + { + "kpi": "risikoprofil", + "entity": "core", + "page": 12, + "validation_status": "spacy-only" + }, + { + "kpi": "risikoprofil", + "entity": "Term / core+", + "page": 12, + "validation_status": "spacy-only" + }, + { + "kpi": 
"risikoprofil", + "entity": "core/core+", + "page": 12, + "validation_status": "validated" + }, + { + "kpi": "risikoprofil", + "entity": "6,4 6,4", + "page": 13, + "validation_status": "spacy-only" + }, + { + "kpi": "risikoprofil", + "entity": "Country /", + "page": 14, + "validation_status": "spacy-only" + }, + { + "kpi": "risikoprofil", + "entity": "Core\nCore\nCore\nCore\nCore\nCore\nCore\nCore", + "page": 14, + "validation_status": "spacy-only" + }, + { + "kpi": "risikoprofil", + "entity": "Core", + "page": 14, + "validation_status": "exxeta-only" + }, + { + "kpi": "risikoprofil", + "entity": "Core", + "page": 14, + "validation_status": "exxeta-only" + }, + { + "kpi": "risikoprofil", + "entity": "Core", + "page": 14, + "validation_status": "exxeta-only" + }, + { + "kpi": "risikoprofil", + "entity": "Core", + "page": 14, + "validation_status": "exxeta-only" + }, + { + "kpi": "risikoprofil", + "entity": "Core", + "page": 14, + "validation_status": "exxeta-only" + }, + { + "kpi": "risikoprofil", + "entity": "Country /", + "page": 15, + "validation_status": "spacy-only" + }, + { + "kpi": "risikoprofil", + "entity": "Core\nCore\nCore\nCore\nCore\nCore", + "page": 15, + "validation_status": "spacy-only" + }, + { + "kpi": "risikoprofil", + "entity": "Core", + "page": 15, + "validation_status": "exxeta-only" + }, + { + "kpi": "risikoprofil", + "entity": "Core", + "page": 15, + "validation_status": "exxeta-only" + }, + { + "kpi": "risikoprofil", + "entity": "Core", + "page": 15, + "validation_status": "exxeta-only" + }, + { + "kpi": "risikoprofil", + "entity": "Core", + "page": 15, + "validation_status": "exxeta-only" + }, + { + "kpi": "risikoprofil", + "entity": "Core", + "page": 15, + "validation_status": "exxeta-only" + }, + { + "kpi": "risikoprofil", + "entity": "countries, giving", + "page": 18, + "validation_status": "spacy-only" + }, + { + "kpi": "risikoprofil", + "entity": "Core", + "page": 19, + "validation_status": "exxeta-only" + }, + { + "kpi": 
"risikoprofil", + "entity": "core/core+", + "page": 20, + "validation_status": "validated" + }, + { + "kpi": "risikoprofil", + "entity": "core/core+", + "page": 20, + "validation_status": "validated" + }, + { + "kpi": "risikoprofil", + "entity": "D, and", + "page": 21, + "validation_status": "spacy-only" + }, + { + "kpi": "risikoprofil", + "entity": "UK, DE, BE, NL, LU,", + "page": 26, + "validation_status": "spacy-only" + }, + { + "kpi": "risikoprofil", + "entity": "Core", + "page": 26, + "validation_status": "exxeta-only" + }, + { + "kpi": "risikoprofil", + "entity": "core or", + "page": 27, + "validation_status": "spacy-only" + }, + { + "kpi": "risikoprofil", + "entity": "Core +", + "page": 27, + "validation_status": "spacy-only" + }, + { + "kpi": "risikoprofil", + "entity": "Core Offices, Core + assets", + "page": 27, + "validation_status": "exxeta-only" + }, + { + "kpi": "risikoprofil", + "entity": "kgCO,e", + "page": 30, + "validation_status": "spacy-only" + }, + { + "kpi": "risikoprofil", + "entity": "C,", + "page": 32, + "validation_status": "spacy-only" + }, + { + "kpi": "risikoprofil", + "entity": "Core, Core+", + "page": 33, + "validation_status": "exxeta-only" + }, + { + "kpi": "risikoprofil", + "entity": "KfW, Dwp", + "page": 35, + "validation_status": "spacy-only" + }, + { + "kpi": "risikoprofil", + "entity": "Bank,", + "page": 35, + "validation_status": "spacy-only" + }, + { + "kpi": "risikoprofil", + "entity": "Core", + "page": 35, + "validation_status": "exxeta-only" + }, + { + "kpi": "risikoprofil", + "entity": "Core", + "page": 35, + "validation_status": "exxeta-only" + }, + { + "kpi": "risikoprofil", + "entity": "Core", + "page": 36, + "validation_status": "spacy-only" + }, + { + "kpi": "risikoprofil", + "entity": "Core", + "page": 36, + "validation_status": "spacy-only" + }, + { + "kpi": "risikoprofil", + "entity": "Core Parking", + "page": 36, + "validation_status": "exxeta-only" + }, + { + "kpi": "risikoprofil", + "entity": "Core Parking", + 
"page": 36, + "validation_status": "exxeta-only" + }, + { + "kpi": "risikoprofil", + "entity": "Core", + "page": 37, + "validation_status": "validated" + }, + { + "kpi": "risikoprofil", + "entity": "Core", + "page": 37, + "validation_status": "validated" + }, + { + "kpi": "risikoprofil", + "entity": "Core", + "page": 38, + "validation_status": "validated" + }, + { + "kpi": "risikoprofil", + "entity": "Core", + "page": 38, + "validation_status": "validated" + } +] \ No newline at end of file