diff --git a/prototypes/fine_tuning_spaCy/__pycache__/training_data.cpython-311.pyc b/prototypes/fine_tuning_spaCy/__pycache__/training_data.cpython-311.pyc index 3cd4fc0..c70e2f9 100644 Binary files a/prototypes/fine_tuning_spaCy/__pycache__/training_data.cpython-311.pyc and b/prototypes/fine_tuning_spaCy/__pycache__/training_data.cpython-311.pyc differ diff --git a/prototypes/fine_tuning_spaCy/data/train.spacy b/prototypes/fine_tuning_spaCy/data/train.spacy index 17fd534..7d9dfbb 100644 Binary files a/prototypes/fine_tuning_spaCy/data/train.spacy and b/prototypes/fine_tuning_spaCy/data/train.spacy differ diff --git a/prototypes/fine_tuning_spaCy/entities_output.json b/prototypes/fine_tuning_spaCy/entities_output.json index 4e58e7f..6d98983 100644 --- a/prototypes/fine_tuning_spaCy/entities_output.json +++ b/prototypes/fine_tuning_spaCy/entities_output.json @@ -1,24 +1,4 @@ [ - { - "label": "RISIKOPROFIL", - "entity": "Content", - "page": 2 - }, - { - "label": "RISIKOPROFIL", - "entity": "Case & Views", - "page": 2 - }, - { - "label": "RISIKOPROFIL", - "entity": "Europe", - "page": 2 - }, - { - "label": "RISIKOPROFIL", - "entity": "Europe", - "page": 3 - }, { "label": "RISIKOPROFIL", "entity": "Core and Core+", @@ -26,114 +6,14 @@ }, { "label": "RISIKOPROFIL", - "entity": "Europe", - "page": 5 - }, - { - "label": "RISIKOPROFIL", - "entity": "CITIES \n-", - "page": 6 - }, - { - "label": "RISIKOPROFIL", - "entity": "UK,", - "page": 6 - }, - { - "label": "RISIKOPROFIL", - "entity": "Czech Republic,", - "page": 6 - }, - { - "label": "RISIKOPROFIL", - "entity": "COMPLETE", - "page": 6 - }, - { - "label": "RISIKOPROFIL", - "entity": "closed-end and", - "page": 6 - }, - { - "label": "RISIKOPROFIL", - "entity": "Club Deals", - "page": 6 - }, - { - "label": "RISIKOPROFIL", - "entity": "Ventures;", - "page": 6 - }, - { - "label": "RISIKOPROFIL", - "entity": "Europe", - "page": 6 - }, - { - "label": "RISIKOPROFIL", - "entity": "Office", - "page": 6 - }, - { - "label": "RISIKOPROFIL", - "entity": "lndustrial/logistics", - "page": 6 - }, - { - "label": "RISIKOPROFIL", - "entity": "Comprehensive service", + "entity": "core, core+, value-added", "page": 7 }, - { - "label": "RISIKOPROFIL", - "entity": "Club Deals", - "page": 7 - }, - { - "label": "RISIKOPROFIL", - "entity": "Club Deals: investing with similar-minded", - "page": 7 - }, - { - "label": "RISIKOPROFIL", - "entity": "Europe", - "page": 7 - }, - { - "label": "RISIKOPROFIL", - "entity": "core, core+", - "page": 7 - }, - { - "label": "RISIKOPROFIL", - "entity": "86", - "page": 8 - }, - { - "label": "RISIKOPROFIL", - "entity": "100", - "page": 8 - }, - { - "label": "RISIKOPROFIL", - "entity": "Europe", - "page": 8 - }, - { - "label": "RISIKOPROFIL", - "entity": "Europe", - "page": 9 - }, { "label": "RISIKOPROFIL", "entity": "Core/Core+", "page": 10 }, - { - "label": "RISIKOPROFIL", - "entity": "Cities", - "page": 10 - }, { "label": "RISIKOPROFIL", "entity": "core/core+", @@ -146,122 +26,27 @@ }, { "label": "RISIKOPROFIL", - "entity": "Europe", + "entity": "UK, DE, BE, NL, LU,", "page": 10 }, { "label": "RISIKOPROFIL", - "entity": "40%", - "page": 10 - }, - { - "label": "RISIKOPROFIL", - "entity": "UK,", - "page": 10 - }, - { - "label": "RISIKOPROFIL", - "entity": "NL,", - "page": 10 - }, - { - "label": "RISIKOPROFIL", - "entity": "LU,", - "page": 10 - }, - { - "label": "RISIKOPROFIL", - "entity": "Nordics,", - "page": 10 - }, - { - "label": "RISIKOPROFIL", - "entity": "IT,", - "page": 10 - }, - { - "label": "RISIKOPROFIL", - "entity": "25% in", - "page": 10 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core: max", - "page": 10 - }, - { - "label": "RISIKOPROFIL", - "entity": "Concentration limits:", - "page": 10 - }, - { - "label": "RISIKOPROFIL", - "entity": "Cash: 4", - "page": 10 - }, - { - "label": "RISIKOPROFIL", - "entity": "Europe", - "page": 10 - }, - { - "label": "RISIKOPROFIL", - "entity": "(€1 Bn of", - "page": 11 - }, - { - "label": "RISIKOPROFIL", - "entity": "AIF", - "page": 11 - }, - { - "label": "RISIKOPROFIL", - "entity": "Form:", - "page": 11 - }, - { - "label": "RISIKOPROFIL", - "entity": "Currency: EUR", - "page": 11 - }, - { - "label": "RISIKOPROFIL", - "entity": "(Dec", - "page": 11 - }, - { - "label": "RISIKOPROFIL", - "entity": "Management/ Fund", - "page": 11 - }, - { - "label": "RISIKOPROFIL", - "entity": "+ M€). Fee", - "page": 11 - }, - { - "label": "RISIKOPROFIL", - "entity": "IRR (payable by investors", - "page": 11 - }, - { - "label": "RISIKOPROFIL", - "entity": "Europe", - "page": 11 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core / Core", + "entity": "Core / Core +", "page": 12 }, { "label": "RISIKOPROFIL", - "entity": "core \n/ core+", + "entity": "core\n/ core+", "page": 12 }, { "label": "RISIKOPROFIL", - "entity": "core \n/ core+", + "entity": "core", + "page": 12 + }, + { + "label": "RISIKOPROFIL", + "entity": "Term / core+", "page": 12 }, { @@ -271,274 +56,34 @@ }, { "label": "RISIKOPROFIL", - "entity": "8,9", + "entity": "6,4 6,4", "page": 13 }, { "label": "RISIKOPROFIL", - "entity": "8,2", - "page": 13 - }, - { - "label": "RISIKOPROFIL", - "entity": "Comments", - "page": 13 - }, - { - "label": "RISIKOPROFIL", - "entity": "PEPFI: Pan", - "page": 13 - }, - { - "label": "RISIKOPROFIL", - "entity": "Europe", - "page": 13 - }, - { - "label": "RISIKOPROFIL", - "entity": "2019", + "entity": "Country /", "page": 14 }, { "label": "RISIKOPROFIL", - "entity": "Country / city", + "entity": "Core\nCore\nCore\nCore\nCore\nCore\nCore\nCore", "page": 14 }, { "label": "RISIKOPROFIL", - "entity": "Comments", - "page": 14 - }, - { - "label": "RISIKOPROFIL", - "entity": "Offices", - "page": 14 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core \n80m", - "page": 14 - }, - { - "label": "RISIKOPROFIL", - "entity": "Commission - well", - "page": 14 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core \n40m", - "page": 14 - }, - { - "label": "RISIKOPROFIL", - "entity": "District -", - "page": 14 - }, - { - "label": "RISIKOPROFIL", - "entity": "100", - "page": 14 - }, - { - "label": "RISIKOPROFIL", - "entity": "Completely", - "page": 14 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core \n<50m \nGood", - "page": 14 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core \n400m \nGood", - "page": 14 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core \n300m", - "page": 14 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core \n99m-102m", - "page": 14 - }, - { - "label": "RISIKOPROFIL", - "entity": "Leopold / Location", - "page": 14 - }, - { - "label": "RISIKOPROFIL", - "entity": "1992 / WAL", - "page": 14 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core \n85m-90m", - "page": 14 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core \n50m-55m", - "page": 14 - }, - { - "label": "RISIKOPROFIL", - "entity": "2020", - "page": 14 - }, - { - "label": "RISIKOPROFIL", - "entity": "Europe", - "page": 14 - }, - { - "label": "RISIKOPROFIL", - "entity": "2019", + "entity": "Country /", "page": 15 }, { "label": "RISIKOPROFIL", - "entity": "Country / city", + "entity": "Core\nCore\nCore\nCore\nCore\nCore", "page": 15 }, - { - "label": "RISIKOPROFIL", - "entity": "Brussels", - "page": 15 - }, - { - "label": "RISIKOPROFIL", - "entity": "Denis", - "page": 15 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core \n34-", - "page": 15 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core \n44m-46m \nOffices", - "page": 15 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core", - "page": 15 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core \n100-150m", - "page": 15 - }, - { - "label": "RISIKOPROFIL", - "entity": "Offices", - "page": 15 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core \n90-", - "page": 15 - }, - { - "label": "RISIKOPROFIL", - "entity": "Offices", - "page": 15 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core \n150 -170", - "page": 15 - }, - { - "label": "RISIKOPROFIL", - "entity": "Europe", - "page": 15 - }, - { - "label": "RISIKOPROFIL", - "entity": "Location", - "page": 15 - }, - { - "label": "RISIKOPROFIL", - "entity": "CBD", - "page": 15 - }, - { - "label": "RISIKOPROFIL", - "entity": "Tenancy", - "page": 15 - }, - { - "label": "RISIKOPROFIL", - "entity": "2020 -", - "page": 15 - }, - { - "label": "RISIKOPROFIL", - "entity": "centre,", - "page": 15 - }, - { - "label": "RISIKOPROFIL", - "entity": "19", - "page": 16 - }, - { - "label": "RISIKOPROFIL", - "entity": "Europe", - "page": 16 - }, - { - "label": "RISIKOPROFIL", - "entity": "Committee", - "page": 17 - }, - { - "label": "RISIKOPROFIL", - "entity": "2.3bn)", - "page": 17 - }, - { - "label": "RISIKOPROFIL", - "entity": "6.0 Bn*", - "page": 17 - }, { "label": "RISIKOPROFIL", "entity": "countries, giving", "page": 18 }, - { - "label": "RISIKOPROFIL", - "entity": "Country", - "page": 18 - }, - { - "label": "RISIKOPROFIL", - "entity": "Leasing", - "page": 18 - }, - { - "label": "RISIKOPROFIL", - "entity": "Europe", - "page": 18 - }, - { - "label": "RISIKOPROFIL", - "entity": "Europe", - "page": 19 - }, - { - "label": "RISIKOPROFIL", - "entity": "Coordination with property", - "page": 19 - }, { "label": "RISIKOPROFIL", "entity": "core/core+", @@ -551,612 +96,72 @@ }, { "label": "RISIKOPROFIL", - "entity": "Source :", - "page": 20 - }, - { - "label": "RISIKOPROFIL", - "entity": "COP", + "entity": "D, and", "page": 21 }, { "label": "RISIKOPROFIL", - "entity": "21", - "page": 21 - }, - { - "label": "RISIKOPROFIL", - "entity": "Committee", - "page": 22 - }, - { - "label": "RISIKOPROFIL", - "entity": "Research,", - "page": 22 - }, - { - "label": "RISIKOPROFIL", - "entity": "Controlling, Liability", - "page": 22 - }, - { - "label": "RISIKOPROFIL", - "entity": "Conducting", - "page": 22 - }, - { - "label": "RISIKOPROFIL", - "entity": "Officer,", - "page": 22 - }, - { - "label": "RISIKOPROFIL", - "entity": "Officer", - "page": 22 - }, - { - "label": "RISIKOPROFIL", - "entity": "Views", - "page": 23 - }, - { - "label": "RISIKOPROFIL", - "entity": "Europe", - "page": 23 - }, - { - "label": "RISIKOPROFIL", - "entity": "Market", - "page": 24 - }, - { - "label": "RISIKOPROFIL", - "entity": "Views", - "page": 24 - }, - { - "label": "RISIKOPROFIL", - "entity": "S1 2010 -", - "page": 24 - }, - { - "label": "RISIKOPROFIL", - "entity": "2019", - "page": 24 - }, - { - "label": "RISIKOPROFIL", - "entity": "Market", - "page": 25 - }, - { - "label": "RISIKOPROFIL", - "entity": "Views", - "page": 25 - }, - { - "label": "RISIKOPROFIL", - "entity": "V", - "page": 25 - }, - { - "label": "RISIKOPROFIL", - "entity": "S1", - "page": 25 - }, - { - "label": "RISIKOPROFIL", - "entity": "2010-", - "page": 25 - }, - { - "label": "RISIKOPROFIL", - "entity": "2019", - "page": 25 - }, - { - "label": "RISIKOPROFIL", - "entity": "Europe", - "page": 25 - }, - { - "label": "RISIKOPROFIL", - "entity": "Vienne", - "page": 25 - }, - { - "label": "RISIKOPROFIL", - "entity": "Prague", - "page": 25 - }, - { - "label": "RISIKOPROFIL", - "entity": "Milan", - "page": 25 - }, - { - "label": "RISIKOPROFIL", - "entity": "Market", + "entity": "UK, DE, BE, NL, LU,", "page": 26 }, { "label": "RISIKOPROFIL", - "entity": "Views", - "page": 26 - }, - { - "label": "RISIKOPROFIL", - "entity": "Conviction - Strong", - "page": 26 - }, - { - "label": "RISIKOPROFIL", - "entity": "Conviction - Medium", - "page": 26 - }, - { - "label": "RISIKOPROFIL", - "entity": "Conviction - Low", - "page": 26 - }, - { - "label": "RISIKOPROFIL", - "entity": "Hospitality", - "page": 26 - }, - { - "label": "RISIKOPROFIL", - "entity": "Hotels,", - "page": 26 - }, - { - "label": "RISIKOPROFIL", - "entity": "UK,", - "page": 26 - }, - { - "label": "RISIKOPROFIL", - "entity": "NL,", - "page": 26 - }, - { - "label": "RISIKOPROFIL", - "entity": "LU,", - "page": 26 - }, - { - "label": "RISIKOPROFIL", - "entity": "Nordics,", - "page": 26 - }, - { - "label": "RISIKOPROFIL", - "entity": "SP,", - "page": 26 - }, - { - "label": "RISIKOPROFIL", - "entity": "IT,", - "page": 26 - }, - { - "label": "RISIKOPROFIL", - "entity": "Current views accross", + "entity": "core or", "page": 27 }, { "label": "RISIKOPROFIL", - "entity": "Conviction -", + "entity": "Core +", "page": 27 }, { "label": "RISIKOPROFIL", - "entity": "Conviction - Medium", - "page": 27 + "entity": "kgCO,e", + "page": 30 }, { "label": "RISIKOPROFIL", - "entity": "Conviction - Low", - "page": 27 + "entity": "C,", + "page": 32 }, { "label": "RISIKOPROFIL", - "entity": "core can m", - "page": 27 + "entity": "KfW, Dwp", + "page": 35 + }, + { + "label": "RISIKOPROFIL", + "entity": "Bank,", + "page": 35 }, { "label": "RISIKOPROFIL", "entity": "Core", - "page": 27 - }, - { - "label": "RISIKOPROFIL", - "entity": "core strategies could", - "page": 27 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core Offices, quality Retail asset", - "page": 27 - }, - { - "label": "RISIKOPROFIL", - "entity": "Czech Rep", - "page": 27 - }, - { - "label": "RISIKOPROFIL", - "entity": "Views", - "page": 27 - }, - { - "label": "RISIKOPROFIL", - "entity": "Information /", - "page": 28 - }, - { - "label": "RISIKOPROFIL", - "entity": "Europe", - "page": 28 - }, - { - "label": "RISIKOPROFIL", - "entity": "71/ 100", - "page": 29 - }, - { - "label": "RISIKOPROFIL", - "entity": "C", - "page": 29 - }, - { - "label": "RISIKOPROFIL", - "entity": "86", - "page": 29 - }, - { - "label": "RISIKOPROFIL", - "entity": "Carbone", - "page": 30 - }, - { - "label": "RISIKOPROFIL", - "entity": "(%", - "page": 30 - }, - { - "label": "RISIKOPROFIL", - "entity": "35", - "page": 30 - }, - { - "label": "RISIKOPROFIL", - "entity": "C) Exposure", - "page": 31 - }, - { - "label": "RISIKOPROFIL", - "entity": "canicules,", - "page": 31 - }, - { - "label": "RISIKOPROFIL", - "entity": "Canicules", - "page": 31 - }, - { - "label": "RISIKOPROFIL", - "entity": "ESG", - "page": 32 - }, - { - "label": "RISIKOPROFIL", - "entity": "Europe", - "page": 32 - }, - { - "label": "RISIKOPROFIL", - "entity": "calculation,", - "page": 33 - }, - { - "label": "RISIKOPROFIL", - "entity": "1/5", - "page": 34 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core \nClosed in 2018", - "page": 34 - }, - { - "label": "RISIKOPROFIL", - "entity": "Portico", - "page": 34 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core \nClosed in 2018", - "page": 34 - }, - { - "label": "RISIKOPROFIL", - "entity": "Value:", - "page": 34 - }, - { - "label": "RISIKOPROFIL", - "entity": "CoC:", - "page": 34 - }, - { - "label": "RISIKOPROFIL", - "entity": "CoC:", - "page": 34 - }, - { - "label": "RISIKOPROFIL", - "entity": "Nedeland", - "page": 34 - }, - { - "label": "RISIKOPROFIL", - "entity": "Comments", - "page": 34 - }, - { - "label": "RISIKOPROFIL", - "entity": "2/5", - "page": 35 - }, - { - "label": "RISIKOPROFIL", - "entity": "Comments", - "page": 35 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core \nParking 64", - "page": 35 - }, - { - "label": "RISIKOPROFIL", - "entity": "Value:", - "page": 35 - }, - { - "label": "RISIKOPROFIL", - "entity": "CoC:", - "page": 35 - }, - { - "label": "RISIKOPROFIL", - "entity": "Commercial 1,028 sqm", - "page": 35 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core \nParking 347", - "page": 35 - }, - { - "label": "RISIKOPROFIL", - "entity": "Value:", - "page": 35 - }, - { - "label": "RISIKOPROFIL", - "entity": "141 ,2m", - "page": 35 - }, - { - "label": "RISIKOPROFIL", - "entity": "Europe", - "page": 35 - }, - { - "label": "RISIKOPROFIL", - "entity": "3/5", "page": 36 }, { "label": "RISIKOPROFIL", - "entity": "Deal", + "entity": "Core", "page": 36 }, { "label": "RISIKOPROFIL", - "entity": "Central", - "page": 36 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core \nClosed in", - "page": 36 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core \nClosed in", - "page": 36 - }, - { - "label": "RISIKOPROFIL", - "entity": "Value:", - "page": 36 - }, - { - "label": "RISIKOPROFIL", - "entity": "m2 \nStorage 1,636", - "page": 36 - }, - { - "label": "RISIKOPROFIL", - "entity": "m2", - "page": 36 - }, - { - "label": "RISIKOPROFIL", - "entity": "100%", - "page": 36 - }, - { - "label": "RISIKOPROFIL", - "entity": "IRR 1 DY:", - "page": 36 - }, - { - "label": "RISIKOPROFIL", - "entity": "(years):", - "page": 36 - }, - { - "label": "RISIKOPROFIL", - "entity": "Cash-on-cash:", - "page": 36 - }, - { - "label": "RISIKOPROFIL", - "entity": "IRR 1 DY:", - "page": 36 - }, - { - "label": "RISIKOPROFIL", - "entity": "(occupancy 96%", - "page": 36 - }, - { - "label": "RISIKOPROFIL", - "entity": "net", - "page": 36 - }, - { - "label": "RISIKOPROFIL", - "entity": "Comments", - "page": 36 - }, - { - "label": "RISIKOPROFIL", - "entity": "4/5", + "entity": "Core", "page": 37 }, { "label": "RISIKOPROFIL", - "entity": "Comments", + "entity": "Core", "page": 37 }, { "label": "RISIKOPROFIL", - "entity": "Defense", - "page": 37 - }, - { - "label": "RISIKOPROFIL", - "entity": "182,765 m2", - "page": 37 - }, - { - "label": "RISIKOPROFIL", - "entity": "Defense", - "page": 37 - }, - { - "label": "RISIKOPROFIL", - "entity": "RTE,", - "page": 37 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core \nWALB (years):", - "page": 37 - }, - { - "label": "RISIKOPROFIL", - "entity": "1 ,", - "page": 37 - }, - { - "label": "RISIKOPROFIL", - "entity": "IRR 10Y", - "page": 37 - }, - { - "label": "RISIKOPROFIL", - "entity": "Hekla", - "page": 37 - }, - { - "label": "RISIKOPROFIL", - "entity": "Defense", - "page": 37 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core \nWALB (years):", - "page": 37 - }, - { - "label": "RISIKOPROFIL", - "entity": "IRR 10Y", - "page": 37 - }, - { - "label": "RISIKOPROFIL", - "entity": "Europe", - "page": 37 - }, - { - "label": "RISIKOPROFIL", - "entity": "5/5", + "entity": "Core", "page": 38 }, { "label": "RISIKOPROFIL", - "entity": "m2", - "page": 38 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core \nTenants:", - "page": 38 - }, - { - "label": "RISIKOPROFIL", - "entity": "(occupancy 68", - "page": 38 - }, - { - "label": "RISIKOPROFIL", - "entity": "IRR 10Y", - "page": 38 - }, - { - "label": "RISIKOPROFIL", - "entity": "m2", - "page": 38 - }, - { - "label": "RISIKOPROFIL", - "entity": "Core \nTenants:", - "page": 38 - }, - { - "label": "RISIKOPROFIL", - "entity": "(occupancy 98.4%", - "page": 38 - }, - { - "label": "RISIKOPROFIL", - "entity": "9.7", - "page": 38 - }, - { - "label": "RISIKOPROFIL", - "entity": "IRR 10Y", - "page": 38 - }, - { - "label": "RISIKOPROFIL", - "entity": "Europe", + "entity": "Core", "page": 38 } ] \ No newline at end of file diff --git a/prototypes/fine_tuning_spaCy/output/model-best/meta.json b/prototypes/fine_tuning_spaCy/output/model-best/meta.json index ca58b77..daabc76 100644 --- a/prototypes/fine_tuning_spaCy/output/model-best/meta.json +++ b/prototypes/fine_tuning_spaCy/output/model-best/meta.json @@ -46,7 +46,7 @@ "f":1.0 } }, - "tok2vec_loss":0.000000011, - "ner_loss":0.0000000457 + "tok2vec_loss":0.000000029, + "ner_loss":0.0000000614 } } \ No newline at end of file diff --git a/prototypes/fine_tuning_spaCy/output/model-best/ner/model b/prototypes/fine_tuning_spaCy/output/model-best/ner/model index 4909428..8a0c5ae 100644 Binary files a/prototypes/fine_tuning_spaCy/output/model-best/ner/model and b/prototypes/fine_tuning_spaCy/output/model-best/ner/model differ diff --git a/prototypes/fine_tuning_spaCy/output/model-best/ner/moves b/prototypes/fine_tuning_spaCy/output/model-best/ner/moves index e27560d..e72ba15 100644 --- a/prototypes/fine_tuning_spaCy/output/model-best/ner/moves +++ b/prototypes/fine_tuning_spaCy/output/model-best/ner/moves @@ -1 +1 @@ -movesx{"0":{},"1":{"RISIKOPROFIL":20},"2":{"RISIKOPROFIL":20},"3":{"RISIKOPROFIL":20},"4":{"RISIKOPROFIL":20,"":1},"5":{"":1}}cfgneg_key \ No newline at end of file +movesx{"0":{},"1":{"RISIKOPROFIL":45},"2":{"RISIKOPROFIL":45},"3":{"RISIKOPROFIL":45},"4":{"RISIKOPROFIL":45,"":1},"5":{"":1}}cfgneg_key \ No newline at end of file diff --git a/prototypes/fine_tuning_spaCy/output/model-best/tok2vec/model b/prototypes/fine_tuning_spaCy/output/model-best/tok2vec/model index 0bf1b64..1cfa6a5 100644 Binary files a/prototypes/fine_tuning_spaCy/output/model-best/tok2vec/model and b/prototypes/fine_tuning_spaCy/output/model-best/tok2vec/model differ diff --git a/prototypes/fine_tuning_spaCy/output/model-best/vocab/strings.json b/prototypes/fine_tuning_spaCy/output/model-best/vocab/strings.json index 4293997..7569f1d 100644 --- a/prototypes/fine_tuning_spaCy/output/model-best/vocab/strings.json +++ b/prototypes/fine_tuning_spaCy/output/model-best/vocab/strings.json @@ -4,6 +4,8 @@ " ", " ", "\"", + "$", + "%", "'", "''", "'-(", @@ -46,6 +48,8 @@ ")/\u00af", "):", "*", + "+", + ",", "-", "-((", "-))", @@ -100,16 +104,40 @@ ".\u00e4.", "/", "/3", + "/Core+", + "/Xxxx+", + "/core+", "/d", + "/xxxx+", "0", + "0%+", "0.0", "0.o", + "022", + "032", + "034", "0_0", "0_o", "1", + "1.", + "10", + "12", + "2", + "2.", + "20", + "2022", + "2032", + "2034", + "250", "3", + "3.", "33", "333", + "35", + "5", + "50", + "7", + "7,50%+", "8", "8)", "8-", @@ -234,9 +262,20 @@ "Abt.", "Abteilung", "Add", + "Aktueller", + "Allgemeine", + "Amsterdam", + "Anlagestrategien", + "Anlagevehikels", + "Ansprechpartners", + "Antagevehikels", "Apr", "Apr.", "April", + "Art", + "Assets", + "Aufl\u00f6sung", + "Aufwertung", "Aug", "Aug.", "August", @@ -250,6 +289,8 @@ "Bd", "Bd.", "Beispiel", + "Berlin", + "Bestandsentwicklungen", "Betr", "Betr.", "Betreff", @@ -271,6 +312,8 @@ "Chr.", "Cie", "Cie.", + "Cities", + "Closings", "Co", "Co.", "Core", @@ -279,12 +322,15 @@ "D", "D.", "D.C.", + "Deutschland", + "Deutschlands", "Dez", "Dez.", "Dezember", "Di", "Di.", "Dienstag", + "Different", "Dipl", "Dipl.", "Dipl.-Ing", @@ -294,9 +340,14 @@ "Donnerstag", "Dr", "Dr.", + "D\u00e4nemark", "E", + "E-Mail", "E.", + "Einw", + "Europe", "European", + "Exit", "F", "F.", "FIL", @@ -310,20 +361,28 @@ "Februar", "Firma", "Fond", + "Fonds", "Fr", "Fr.", + "Frankreich", "Frau", "Frl", "Frl.", "Fr\u00e4ulein", + "Fu\u0308hrende", "G", "G.", "G.m.b", "G.m.b.H.", + "Gateway", "Gebr", "Gebr.", + "Geplantes", + "Gesamtrendite", "H", "H.", + "Halten", + "Halten-Strategie", "Hauptbahnhof", "Hbf", "Hbf.", @@ -343,13 +402,21 @@ "II.", "III", "III.", + "INREV", + "IRR", "IV", "IV.", + "Immobilien", + "Immobilien-Spezialfonds", "Inc", "Inc.", + "Informationen", "Ing", "Ing.", + "Investmentmanagers", "J", + "Jahr", + "Jahre", "Jahrhundert", "Jan", "Jan.", @@ -369,16 +436,26 @@ "K", "K.", "K.O.", + "Kaufen", + "Key", "L", "L'", "L.A.", + "LTV", + "LTY", + "Laufzeit", + "London", + "L\u00e4nderallokation", "L\u2019", "M", "M.", "M.A.", "M.Sc", "M.Sc.", + "Manager", + "Maximaler", "Mehrwertsteuer", + "Metropolregionen", "Mi", "Mi.", "Milliarde", @@ -403,6 +480,9 @@ "N", "N.Y.", "N.Y.C.", + "Nachvermietungsstrategie", + "Name", + "Niederlande", "Nov", "Nov.", "November", @@ -416,14 +496,18 @@ "O.o", "O_O", "O_o", + "Offen", + "Offener", "Okt", "Okt.", "Oktober", + "OpCo", "Orig", "Orig.", "Original", "P", "P.S.", + "Paris", "Pkt", "Pkt.", "Prof", @@ -434,32 +518,44 @@ "R.", "R.I.P.", "RE", + "REV", "RISIKOPROFIL", "ROOT", + "Rechtsform", "Red", "Red.", "Redaktion", + "Risikoprofil", + "Risk", "S", "S'", "Sa", "Sa.", "Samstag", "Sc.", + "Schweden", "Sep", "Sep.", "Sept", "Sept.", "September", + "Sitz", + "Skandinavien", "So", "So.", "Sonntag", "St", "St.", + "Standortaufwertungsstrategie", + "Standorte", "Std", "Std.", + "Stil", "Str", "Str.", + "Strategie", "Stra\u00dfe", + "Struktur", "Stunde", "S\u2019", "T", @@ -468,6 +564,7 @@ "Tel", "Tel.", "Telefon", + "Telefonnummer", "The", "Tr", "Tr.", @@ -488,8 +585,12 @@ "Value", "Vol", "Vol.", + "W", + "Wertstabile", + "Wohnimmobilien", "X'", "X++", + "X-Xxxx", "X.", "X.X", "X.X.", @@ -518,13 +619,25 @@ "Xxxx", "Xxxx+", "Xxxx+/Xxxxx", + "Xxxx-XXX", + "Xxxx-Xxxxx-XXX", "Xxxx.", "Xxxx.-Xxx", "Xxxx.-Xxx.", "Xxxxx", + "Xxxxx)/Xxxx", + "Xxxxx-Xxxxx", + "Xx\u0308xxxx", "X\u2019", "Z", "Z.", + "Ziel-LTY", + "Ziel-Netto-IRR", + "Zielanlagestrategie", + "Zielregionfen)/Jand", + "Zielsektoren", + "Zielvolumen", + "Ziirraiaein", "Zt", "Zt.", "[", @@ -578,21 +691,41 @@ "add", "adv", "adv.", + "ahr", + "ail", + "aiming", + "aktueller", "al", "al.", "allg", "allg.", "allgemein", + "allgemeine", "am.", + "ame", + "amsterdam", "an.", + "and", + "anlagestrategien", + "anlagevehikels", + "ansprechpartners", + "antagevehikels", "apr", "apr.", + "ark", + "art", + "asset", + "assets", + "at", "at.", "ath", "auf", + "aufl\u00f6sung", + "aufwertung", "auf\u2019m", "aug", "aug.", + "ausgew\u00e4hlte", "ax.", "b", "b.", @@ -604,6 +737,8 @@ "bd.", "beispielsweise", "ber", + "berlin", + "bestandsentwicklungen", "betr", "betr.", "beziehungsweise", @@ -632,23 +767,29 @@ "c/o", "ca", "ca.", + "cal", "cdu", "cdu/csu", "ce>", "chr", "chr.", + "cht", "cie", "cie.", + "cities", "cl.", + "closings", "co", "co.", "core", "core+", "core+/value", "csu", + "cts", "d", "d'", "d)", + "d,dd%+", "d-", "d-)", "d-X", @@ -661,30 +802,43 @@ "dX", "d_d", "d_x", + "dam", "dd", "ddd", + "dddd", "de", + "ded", "dem", + "den", + "der", "dergleichen", + "des", + "deutschland", + "deutschlands", + "development", "dez", "dez.", "dgl", "dgl.", "di", "di.", + "different", "dipl", "dipl.", "dipl.-ing", "dipl.-ing.", "do", "do.", + "don", "dr", "dr.", "du", "du\u2019s", "dv.", + "d\u00e4nemark", "d\u2019", "e", + "e-mail", "e.", "e.V.", "e.d", @@ -699,6 +853,7 @@ "ebr", "ed.", "egr", + "egy", "ehem", "ehem.", "eigentlich", @@ -708,12 +863,17 @@ "eine", "einem", "einen", + "einw", + "eit", "el.", + "els", "em.", "en.", "engl", "engl.", "englisch", + "enhancing", + "ent", "entspr", "entspr.", "ep.", @@ -724,18 +884,23 @@ "erm.", "err", "ers", + "ersten", "er\u2019s", "es", "etc", "etc.", "etr", + "ets", + "europe", "european", "ev", "ev.", "eventuell", "evtl", "evtl.", + "exit", "expertise", + "exposure", "ez.", "e\u2019s", "f", @@ -746,9 +911,14 @@ "fam.", "feb", "feb.", + "fen", + "festgelegter", + "fil", "fond", + "fonds", "fr", "fr.", + "frankreich", "franz\u00f6sisch", "frl", "frl.", @@ -756,11 +926,13 @@ "frz.", "fs.", "fund", + "fu\u0308hrende", "f\u2019m", "g", "g.", "g.m.b", "g.m.b.h.", + "gateway", "geb", "geb.", "gebr", @@ -772,6 +944,10 @@ "gegr\u00fcndet", "gem", "gem.", + "gen", + "geplantes", + "ger", + "gesamtrendite", "gf.", "gfs", "ggf", @@ -780,6 +956,7 @@ "ggfs.", "gg\u00fc", "gg\u00fc.", + "gie", "gl.", "good", "gr.", @@ -789,9 +966,12 @@ "h.", "h.c", "h.c.", + "halten", + "halten-strategie", "hbf", "hbf.", "hd.", + "hed", "hem", "hf.", "hg", @@ -799,8 +979,10 @@ "hil", "hinter", "hinter\u2019m", + "hotels", "hr", "hr.", + "hre", "hrn", "hrn.", "hrsg", @@ -826,6 +1008,8 @@ "ich", "ich\u2019s", "ie.", + "ien", + "ies", "ig.", "ihr", "ihr\u2019s", @@ -834,7 +1018,11 @@ "iii", "iii.", "il.", + "ile", "illustration", + "ime", + "immobilien", + "immobilien-spezialfonds", "in", "in.", "inc", @@ -842,26 +1030,41 @@ "incl", "incl.", "ind", + "ine", + "informationen", "ing", "ing.", + "initiatives", "inkl", "inkl.", "inklusive", + "inrev", "insb", "insb.", "insbesondere", + "investmentmanagers", "investments", + "inw", "io.", "iol", "ion", "ipl", + "irr", "is", "ise", + "isk", "iss", + "ite", + "ith", + "ity", + "itz", + "ium", "iv", "iv.", "j", "j.", + "jahr", + "jahre", "jan", "jan.", "jh", @@ -882,24 +1085,38 @@ "kath", "kath.", "katholisch", + "kaufen", + "ket", + "key", "kl.", "kt.", "l", "l'", "l.", "l.a.", + "langfristig", "lat", "lat.", + "laufzeit", "laut", + "le.g", + "ler", + "level", "lg.", + "lin", "lio", "llg", "llt", "llv", + "london", "lt", "lt.", + "lte", + "ltv", + "lty", "lue", "lv.", + "l\u00e4nderallokation", "l\u2019", "m", "m.", @@ -911,19 +1128,30 @@ "m.m.", "m.sc", "m.sc.", + "mal", + "management", + "manager", + "market", "max", "max.", "maximal", + "maximaler", + "men", + "mer", + "metropolregionen", "mi", "mi.", + "million", "min", "min.", "mind", "mind.", "mindestens", "minimal", + "minor", "mio", "mio.", + "mit", "mo", "mo.", "monatlich", @@ -946,18 +1174,26 @@ "n.r", "n.y.", "n.y.c.", + "nachvermietungsstrategie", + "name", "nat", "nat.", "nc.", "ncl", "nd.", + "nde", + "nds", "ne", "nem", "nen", + "ner", "ng.", "ngl", + "ngs", + "niederlande", "niv", "nkl", + "nor", "nov", "nov.", "nr", @@ -982,23 +1218,32 @@ "o_o", "of", "of.", + "offen", + "offener", "og.", "okt", "okt.", "ol.", "ond", "ood", + "opco", + "ope", "ore", "orig", "orig.", "original", + "orm", "ov.", + "over", "p", "p.", "p.a", "p.a.", "p.s", "p.s.", + "pCo", + "paris", + "pco", "pers", "pers.", "phil", @@ -1008,26 +1253,35 @@ "pl.", "portfolio", "pr.", + "premium", "prof", "prof.", + "profile", + "projects", "pt.", "pw.", "q", "q.", "q.e.d", "q.e.d.", + "quality", "r", "r.", "r.i.p.", "rd.", "re", "re+", + "rechtsform", "red", "red.", + "ren", "rer", "rer.", + "rev", "rig", + "ris", "risikoprofil", + "risk", "rl.", "rm.", "rn.", @@ -1035,6 +1289,7 @@ "rr.", "rs.", "rsg", + "rte", "rz.", "r\u00f6m", "r\u00f6m.", @@ -1050,6 +1305,7 @@ "sa", "sa.", "sb.", + "schweden", "sd.", "sen", "sen.", @@ -1057,10 +1313,13 @@ "sep.", "sept", "sept.", + "set", "sf.", "sg.", "sie", "sie\u2019s", + "sitz", + "skandinavien", "so", "so.", "sog", @@ -1072,28 +1331,43 @@ "ss.", "st", "st.", + "standortaufwertungsstrategie", + "standorte", "std", "std.", "stellv", "stellv.", + "stil", "str", "str.", + "strategie", + "strategy", + "struktur", "sw.", "s\u2019", "t", "t.", + "tactical", "tc.", "td.", "tel", "tel.", + "telefonnummer", + "ten", "ter", + "tes", "th.", "the", + "tig", + "til", + "time", "tl.", + "to", "tr", "tr.", "tsd", "tsd.", + "tur", "t\u00e4gl", "t\u00e4gl.", "t\u00e4glich", @@ -1114,11 +1388,13 @@ "ul.", "un.", "und", + "ung", "univ", "univ.", "unter", "unter\u2019m", "ur.", + "ure", "usf", "usf.", "usw", @@ -1138,7 +1414,11 @@ "v.v", "v_v", "value", + "value-added", + "vel", + "ver", "vergleiche", + "ves", "vgl", "vgl.", "vielleicht", @@ -1149,6 +1429,7 @@ "vm.", "vol", "vol.", + "vom", "vor", "vor\u2019m", "vs", @@ -1157,12 +1438,18 @@ "w", "w.", "wSt", + "way", + "well-established", + "wertstabile", "wir", "wir\u2019s", "wiss", "wiss.", + "with", + "wohnimmobilien", "x", "x'", + "x-xxxx", "x.", "x.X", "x.X.", @@ -1186,16 +1473,23 @@ "x_x", "xd", "xdd", + "xit", "xx", "xx.", + "xx.x", "xxx", "xxx.", "xxxx", + "xxxx)/xxxx", "xxxx+", "xxxx+/xxxx", + "xxxx-xxx", + "xxxx-xxxx", + "xxxx-xxxx-xxx", "xxxx.", "xxxx\u2019x", "xxx\u2019x", + "xx\u0308xxxx", "xx\u2019x", "x\u2019", "x\ufe35x", @@ -1215,6 +1509,13 @@ "z.z.", "z.zt.", "zgl", + "ziel-lty", + "ziel-netto-irr", + "zielanlagestrategie", + "zielregionfen)/jand", + "zielsektoren", + "zielvolumen", + "ziirraiaein", "zt", "zw.", "zzgl", @@ -1236,6 +1537,8 @@ "\u00b0f.", "\u00b0k.", "\u00b0x.", + "\u00dc", + "\u00dcbersicht", "\u00e4", "\u00e4.", "\u00e4gl", @@ -1249,6 +1552,7 @@ "\u00fc", "\u00fc.", "\u00fcber", + "\u00fcbersicht", "\u00fcber\u2019m", "\u0ca0", "\u0ca0_\u0ca0", @@ -1269,6 +1573,8 @@ "\u2019xx", "\u2019xxx", "\u2019\u2019", + "\u201a", + "\u20ac", "\u2501", "\u253b", "\u253b\u2501\u253b", diff --git a/prototypes/fine_tuning_spaCy/output/model-last/meta.json b/prototypes/fine_tuning_spaCy/output/model-last/meta.json index ca58b77..daabc76 100644 --- a/prototypes/fine_tuning_spaCy/output/model-last/meta.json +++ b/prototypes/fine_tuning_spaCy/output/model-last/meta.json @@ -46,7 +46,7 @@ "f":1.0 } }, - "tok2vec_loss":0.000000011, - "ner_loss":0.0000000457 + "tok2vec_loss":0.000000029, + "ner_loss":0.0000000614 } } \ No newline at end of file diff --git a/prototypes/fine_tuning_spaCy/output/model-last/ner/model b/prototypes/fine_tuning_spaCy/output/model-last/ner/model index 4909428..8a0c5ae 100644 Binary files a/prototypes/fine_tuning_spaCy/output/model-last/ner/model and b/prototypes/fine_tuning_spaCy/output/model-last/ner/model differ diff --git a/prototypes/fine_tuning_spaCy/output/model-last/ner/moves b/prototypes/fine_tuning_spaCy/output/model-last/ner/moves index e27560d..e72ba15 100644 --- a/prototypes/fine_tuning_spaCy/output/model-last/ner/moves +++ b/prototypes/fine_tuning_spaCy/output/model-last/ner/moves @@ -1 +1 @@ -movesx{"0":{},"1":{"RISIKOPROFIL":20},"2":{"RISIKOPROFIL":20},"3":{"RISIKOPROFIL":20},"4":{"RISIKOPROFIL":20,"":1},"5":{"":1}}cfgneg_key \ No newline at end of file +movesx{"0":{},"1":{"RISIKOPROFIL":45},"2":{"RISIKOPROFIL":45},"3":{"RISIKOPROFIL":45},"4":{"RISIKOPROFIL":45,"":1},"5":{"":1}}cfgneg_key \ No newline at end of file diff --git a/prototypes/fine_tuning_spaCy/output/model-last/tok2vec/model b/prototypes/fine_tuning_spaCy/output/model-last/tok2vec/model index 0bf1b64..1cfa6a5 100644 Binary files a/prototypes/fine_tuning_spaCy/output/model-last/tok2vec/model and b/prototypes/fine_tuning_spaCy/output/model-last/tok2vec/model differ diff --git a/prototypes/fine_tuning_spaCy/output/model-last/vocab/strings.json b/prototypes/fine_tuning_spaCy/output/model-last/vocab/strings.json index 4293997..7569f1d 100644 --- a/prototypes/fine_tuning_spaCy/output/model-last/vocab/strings.json +++ b/prototypes/fine_tuning_spaCy/output/model-last/vocab/strings.json @@ -4,6 +4,8 @@ " ", " ", "\"", + "$", + "%", "'", "''", "'-(", @@ -46,6 +48,8 @@ ")/\u00af", "):", "*", + "+", + ",", "-", "-((", "-))", @@ -100,16 +104,40 @@ ".\u00e4.", "/", "/3", + "/Core+", + "/Xxxx+", + "/core+", "/d", + "/xxxx+", "0", + "0%+", "0.0", "0.o", + "022", + "032", + "034", "0_0", "0_o", "1", + "1.", + "10", + "12", + "2", + "2.", + "20", + "2022", + "2032", + "2034", + "250", "3", + "3.", "33", "333", + "35", + "5", + "50", + "7", + "7,50%+", "8", "8)", "8-", @@ -234,9 +262,20 @@ "Abt.", "Abteilung", "Add", + "Aktueller", + "Allgemeine", + "Amsterdam", + "Anlagestrategien", + "Anlagevehikels", + "Ansprechpartners", + "Antagevehikels", "Apr", "Apr.", "April", + "Art", + "Assets", + "Aufl\u00f6sung", + "Aufwertung", "Aug", "Aug.", "August", @@ -250,6 +289,8 @@ "Bd", "Bd.", "Beispiel", + "Berlin", + "Bestandsentwicklungen", "Betr", "Betr.", "Betreff", @@ -271,6 +312,8 @@ "Chr.", "Cie", "Cie.", + "Cities", + "Closings", "Co", "Co.", "Core", @@ -279,12 +322,15 @@ "D", "D.", "D.C.", + "Deutschland", + "Deutschlands", "Dez", "Dez.", "Dezember", "Di", "Di.", "Dienstag", + "Different", "Dipl", "Dipl.", "Dipl.-Ing", @@ -294,9 +340,14 @@ "Donnerstag", "Dr", "Dr.", + "D\u00e4nemark", "E", + "E-Mail", "E.", + "Einw", + "Europe", "European", + "Exit", "F", "F.", "FIL", @@ -310,20 +361,28 @@ "Februar", "Firma", "Fond", + "Fonds", "Fr", "Fr.", + "Frankreich", "Frau", "Frl", "Frl.", "Fr\u00e4ulein", + "Fu\u0308hrende", "G", "G.", "G.m.b", "G.m.b.H.", + "Gateway", "Gebr", "Gebr.", + "Geplantes", + "Gesamtrendite", "H", "H.", + "Halten", + "Halten-Strategie", "Hauptbahnhof", "Hbf", "Hbf.", @@ -343,13 +402,21 @@ "II.", "III", "III.", + "INREV", + "IRR", "IV", "IV.", + "Immobilien", + "Immobilien-Spezialfonds", "Inc", "Inc.", + "Informationen", "Ing", "Ing.", + "Investmentmanagers", "J", + "Jahr", + "Jahre", "Jahrhundert", "Jan", "Jan.", @@ -369,16 +436,26 @@ "K", "K.", "K.O.", + "Kaufen", + "Key", "L", "L'", "L.A.", + "LTV", + "LTY", + "Laufzeit", + "London", + "L\u00e4nderallokation", "L\u2019", "M", "M.", "M.A.", "M.Sc", "M.Sc.", + "Manager", + "Maximaler", "Mehrwertsteuer", + "Metropolregionen", "Mi", "Mi.", "Milliarde", @@ -403,6 +480,9 @@ "N", "N.Y.", "N.Y.C.", + "Nachvermietungsstrategie", + "Name", + "Niederlande", "Nov", "Nov.", "November", @@ -416,14 +496,18 @@ "O.o", "O_O", "O_o", + "Offen", + "Offener", "Okt", "Okt.", "Oktober", + "OpCo", "Orig", "Orig.", "Original", "P", "P.S.", + "Paris", "Pkt", "Pkt.", "Prof", @@ -434,32 +518,44 @@ "R.", "R.I.P.", "RE", + "REV", "RISIKOPROFIL", "ROOT", + "Rechtsform", "Red", "Red.", "Redaktion", + "Risikoprofil", + "Risk", "S", "S'", "Sa", "Sa.", "Samstag", "Sc.", + "Schweden", "Sep", "Sep.", "Sept", "Sept.", "September", + "Sitz", + "Skandinavien", "So", "So.", "Sonntag", "St", "St.", + "Standortaufwertungsstrategie", + "Standorte", "Std", "Std.", + "Stil", "Str", "Str.", + "Strategie", "Stra\u00dfe", + "Struktur", "Stunde", "S\u2019", "T", @@ -468,6 +564,7 @@ "Tel", "Tel.", "Telefon", + "Telefonnummer", "The", "Tr", "Tr.", @@ -488,8 +585,12 @@ "Value", "Vol", "Vol.", + "W", + "Wertstabile", + "Wohnimmobilien", "X'", "X++", + "X-Xxxx", "X.", "X.X", "X.X.", @@ -518,13 +619,25 @@ "Xxxx", "Xxxx+", "Xxxx+/Xxxxx", + "Xxxx-XXX", + "Xxxx-Xxxxx-XXX", "Xxxx.", "Xxxx.-Xxx", "Xxxx.-Xxx.", "Xxxxx", + "Xxxxx)/Xxxx", + "Xxxxx-Xxxxx", + "Xx\u0308xxxx", "X\u2019", "Z", "Z.", + "Ziel-LTY", + "Ziel-Netto-IRR", + "Zielanlagestrategie", + "Zielregionfen)/Jand", + "Zielsektoren", + "Zielvolumen", + "Ziirraiaein", "Zt", "Zt.", "[", @@ -578,21 +691,41 @@ "add", "adv", "adv.", + "ahr", + "ail", + "aiming", + "aktueller", "al", "al.", "allg", "allg.", "allgemein", + "allgemeine", "am.", + "ame", + "amsterdam", "an.", + "and", + "anlagestrategien", + "anlagevehikels", + "ansprechpartners", + "antagevehikels", "apr", "apr.", + "ark", + "art", + "asset", + "assets", + "at", "at.", "ath", "auf", + "aufl\u00f6sung", + "aufwertung", "auf\u2019m", "aug", "aug.", + "ausgew\u00e4hlte", "ax.", "b", "b.", @@ -604,6 +737,8 @@ "bd.", "beispielsweise", "ber", + "berlin", + "bestandsentwicklungen", "betr", "betr.", "beziehungsweise", @@ -632,23 +767,29 @@ "c/o", "ca", "ca.", + "cal", "cdu", "cdu/csu", "ce>", "chr", "chr.", + "cht", "cie", "cie.", + "cities", "cl.", + "closings", "co", "co.", "core", "core+", "core+/value", "csu", + "cts", "d", "d'", "d)", + "d,dd%+", "d-", "d-)", "d-X", @@ -661,30 +802,43 @@ "dX", "d_d", "d_x", + "dam", "dd", "ddd", + "dddd", "de", + "ded", "dem", + "den", + "der", "dergleichen", + "des", + "deutschland", + "deutschlands", + "development", "dez", "dez.", "dgl", "dgl.", "di", "di.", + "different", "dipl", "dipl.", "dipl.-ing", "dipl.-ing.", "do", "do.", + "don", "dr", "dr.", "du", "du\u2019s", "dv.", + "d\u00e4nemark", "d\u2019", "e", + "e-mail", "e.", "e.V.", "e.d", @@ -699,6 +853,7 @@ "ebr", "ed.", "egr", + "egy", "ehem", "ehem.", "eigentlich", @@ -708,12 +863,17 @@ "eine", "einem", "einen", + "einw", + "eit", "el.", + "els", "em.", "en.", "engl", "engl.", "englisch", + "enhancing", + "ent", "entspr", "entspr.", "ep.", @@ -724,18 +884,23 @@ "erm.", "err", "ers", + "ersten", "er\u2019s", "es", "etc", "etc.", "etr", + "ets", + "europe", "european", "ev", "ev.", "eventuell", "evtl", "evtl.", + "exit", "expertise", + "exposure", "ez.", "e\u2019s", "f", @@ -746,9 +911,14 @@ "fam.", "feb", "feb.", + "fen", + "festgelegter", + "fil", "fond", + "fonds", "fr", "fr.", + "frankreich", "franz\u00f6sisch", "frl", "frl.", @@ -756,11 +926,13 @@ "frz.", "fs.", "fund", + "fu\u0308hrende", "f\u2019m", "g", "g.", "g.m.b", "g.m.b.h.", + "gateway", "geb", "geb.", "gebr", @@ -772,6 +944,10 @@ "gegr\u00fcndet", "gem", "gem.", + "gen", + "geplantes", + "ger", + "gesamtrendite", "gf.", "gfs", "ggf", @@ -780,6 +956,7 @@ "ggfs.", "gg\u00fc", "gg\u00fc.", + "gie", "gl.", "good", "gr.", @@ -789,9 +966,12 @@ "h.", "h.c", "h.c.", + "halten", + "halten-strategie", "hbf", "hbf.", "hd.", + "hed", "hem", "hf.", "hg", @@ -799,8 +979,10 @@ "hil", "hinter", "hinter\u2019m", + "hotels", "hr", "hr.", + "hre", "hrn", "hrn.", "hrsg", @@ -826,6 +1008,8 @@ "ich", "ich\u2019s", "ie.", + "ien", + "ies", "ig.", "ihr", "ihr\u2019s", @@ -834,7 +1018,11 @@ "iii", "iii.", "il.", + "ile", "illustration", + "ime", + "immobilien", + "immobilien-spezialfonds", "in", "in.", "inc", @@ -842,26 +1030,41 @@ "incl", "incl.", "ind", + "ine", + "informationen", "ing", "ing.", + "initiatives", "inkl", "inkl.", "inklusive", + "inrev", "insb", "insb.", "insbesondere", + "investmentmanagers", "investments", + "inw", "io.", "iol", "ion", "ipl", + "irr", "is", "ise", + "isk", "iss", + "ite", + "ith", + "ity", + "itz", + "ium", "iv", "iv.", "j", "j.", + "jahr", + "jahre", "jan", "jan.", "jh", @@ -882,24 +1085,38 @@ "kath", "kath.", "katholisch", + "kaufen", + "ket", + "key", "kl.", "kt.", "l", "l'", "l.", "l.a.", + "langfristig", "lat", "lat.", + "laufzeit", "laut", + "le.g", + "ler", + "level", "lg.", + "lin", "lio", "llg", "llt", "llv", + "london", "lt", "lt.", + "lte", + "ltv", + "lty", "lue", "lv.", + "l\u00e4nderallokation", "l\u2019", "m", "m.", @@ -911,19 +1128,30 @@ "m.m.", "m.sc", "m.sc.", + "mal", + "management", + "manager", + "market", "max", "max.", "maximal", + "maximaler", + "men", + "mer", + "metropolregionen", "mi", "mi.", + "million", "min", "min.", "mind", "mind.", "mindestens", "minimal", + "minor", "mio", "mio.", + "mit", "mo", "mo.", "monatlich", @@ -946,18 +1174,26 @@ "n.r", "n.y.", "n.y.c.", + "nachvermietungsstrategie", + "name", "nat", "nat.", "nc.", "ncl", "nd.", + "nde", + "nds", "ne", "nem", "nen", + "ner", "ng.", "ngl", + "ngs", + "niederlande", "niv", "nkl", + "nor", "nov", "nov.", "nr", @@ -982,23 +1218,32 @@ "o_o", "of", "of.", + "offen", + "offener", "og.", "okt", "okt.", "ol.", "ond", "ood", + "opco", + "ope", "ore", "orig", "orig.", "original", + "orm", "ov.", + "over", "p", "p.", "p.a", "p.a.", "p.s", "p.s.", + "pCo", + "paris", + "pco", "pers", "pers.", "phil", @@ -1008,26 +1253,35 @@ "pl.", "portfolio", "pr.", + "premium", "prof", "prof.", + "profile", + "projects", "pt.", "pw.", "q", "q.", "q.e.d", "q.e.d.", + "quality", "r", "r.", "r.i.p.", "rd.", "re", "re+", + "rechtsform", "red", "red.", + "ren", "rer", "rer.", + "rev", "rig", + "ris", "risikoprofil", + "risk", "rl.", "rm.", "rn.", @@ -1035,6 +1289,7 @@ "rr.", "rs.", "rsg", + "rte", "rz.", "r\u00f6m", "r\u00f6m.", @@ -1050,6 +1305,7 @@ "sa", "sa.", "sb.", + "schweden", "sd.", "sen", "sen.", @@ -1057,10 +1313,13 @@ "sep.", "sept", "sept.", + "set", "sf.", "sg.", "sie", "sie\u2019s", + "sitz", + "skandinavien", "so", "so.", "sog", @@ -1072,28 +1331,43 @@ "ss.", "st", "st.", + "standortaufwertungsstrategie", + "standorte", "std", "std.", "stellv", "stellv.", + "stil", "str", "str.", + "strategie", + "strategy", + "struktur", "sw.", "s\u2019", "t", "t.", + "tactical", "tc.", "td.", "tel", "tel.", + "telefonnummer", + "ten", "ter", + "tes", "th.", "the", + "tig", + "til", + "time", "tl.", + "to", "tr", "tr.", "tsd", "tsd.", + "tur", "t\u00e4gl", "t\u00e4gl.", "t\u00e4glich", @@ -1114,11 +1388,13 @@ "ul.", "un.", "und", + "ung", "univ", "univ.", "unter", "unter\u2019m", "ur.", + "ure", "usf", "usf.", "usw", @@ -1138,7 +1414,11 @@ "v.v", "v_v", "value", + "value-added", + "vel", + "ver", "vergleiche", + "ves", "vgl", "vgl.", "vielleicht", @@ -1149,6 +1429,7 @@ "vm.", "vol", "vol.", + "vom", "vor", "vor\u2019m", "vs", @@ -1157,12 +1438,18 @@ "w", "w.", "wSt", + "way", + "well-established", + "wertstabile", "wir", "wir\u2019s", "wiss", "wiss.", + "with", + "wohnimmobilien", "x", "x'", + "x-xxxx", "x.", "x.X", "x.X.", @@ -1186,16 +1473,23 @@ "x_x", "xd", "xdd", + "xit", "xx", "xx.", + "xx.x", "xxx", "xxx.", "xxxx", + "xxxx)/xxxx", "xxxx+", "xxxx+/xxxx", + "xxxx-xxx", + "xxxx-xxxx", + "xxxx-xxxx-xxx", "xxxx.", "xxxx\u2019x", "xxx\u2019x", + "xx\u0308xxxx", "xx\u2019x", "x\u2019", "x\ufe35x", @@ -1215,6 +1509,13 @@ "z.z.", "z.zt.", "zgl", + "ziel-lty", + "ziel-netto-irr", + "zielanlagestrategie", + "zielregionfen)/jand", + "zielsektoren", + "zielvolumen", + "ziirraiaein", "zt", "zw.", "zzgl", @@ -1236,6 +1537,8 @@ "\u00b0f.", "\u00b0k.", "\u00b0x.", + "\u00dc", + "\u00dcbersicht", "\u00e4", "\u00e4.", "\u00e4gl", @@ -1249,6 +1552,7 @@ "\u00fc", "\u00fc.", "\u00fcber", + "\u00fcbersicht", "\u00fcber\u2019m", "\u0ca0", "\u0ca0_\u0ca0", @@ -1269,6 +1573,8 @@ "\u2019xx", "\u2019xxx", "\u2019\u2019", + "\u201a", + "\u20ac", "\u2501", "\u253b", "\u253b\u2501\u253b", diff --git a/prototypes/fine_tuning_spaCy/test_model.py b/prototypes/fine_tuning_spaCy/test_model.py index 8be6f35..7286d43 100644 --- a/prototypes/fine_tuning_spaCy/test_model.py +++ b/prototypes/fine_tuning_spaCy/test_model.py @@ -4,7 +4,7 @@ import json from pathlib import Path nlp = spacy.load("output/model-last") -input_pdf = Path("../../pitch-books/Pitchbook 1.pdf") +input_pdf = Path("../ocr/output/Pitchbook 1-OCR.pdf") doc = fitz.open(input_pdf) diff --git a/prototypes/fine_tuning_spaCy/training_data.py b/prototypes/fine_tuning_spaCy/training_data.py index a491f6b..ed2e4d3 100644 --- a/prototypes/fine_tuning_spaCy/training_data.py +++ b/prototypes/fine_tuning_spaCy/training_data.py @@ -1,26 +1,66 @@ TRAINING_DATA = [ ( - "Core",{"entities":[[0,4,"RISIKOPROFIL"]]}, + "Core", + {"entities":[[0,4,"RISIKOPROFIL"]]}, ), ( - "Core+",{"entities":[[0,5,"RISIKOPROFIL"]]}, + "Core+", + {"entities":[[0,5,"RISIKOPROFIL"]]}, ), ( - "Core/Core+",{"entities":[[0,10,"RISIKOPROFIL"]]}, + "Core/Core+", + {"entities":[[0,10,"RISIKOPROFIL"]]}, ), ( - "Value Add",{"entities":[[0,9,"RISIKOPROFIL"]]}, + "Value Add", + {"entities":[[0,9,"RISIKOPROFIL"]]}, ), ( - "Core/Value Add",{"entities":[[0,14,"RISIKOPROFIL"]]}, + "Core/Value Add", + {"entities":[[0,14,"RISIKOPROFIL"]]}, ), ( - "Core+/Value Add",{"entities":[[0,15,"RISIKOPROFIL"]]}, + "Core+/Value Add", + {"entities":[[0,15,"RISIKOPROFIL"]]}, ), ( - "Core/Core+/Value Add",{"entities":[[0,20,"RISIKOPROFIL"]]}, + "Core/Core+/Value Add", + {"entities":[[0,20,"RISIKOPROFIL"]]}, ), ( - "The RE portfolio of the fund is a good illustration of Fond expertise in European core/core+ investments .",{"entities":[[82,92,"RISIKOPROFIL"]]}, + "The RE portfolio of the fund is a good illustration of Fond expertise in European core/core+ investments .", + {"entities":[[82,92,"RISIKOPROFIL"]]}, ), + ( + "Risk level: Core/Core+", + {"entities":[[12,22,"RISIKOPROFIL"]]}, + ), + ( + "Different risk profile (core, core+, value-added)", + {"entities":[[24,48,"RISIKOPROFIL"]]}, + ), + ( + "Core/Core+ with OpCo premium", + {"entities":[[0,10,"RISIKOPROFIL"]]}, + ), + ( + "Core /Core+ Assets, well-established = Key Gateway Cities in Europe le.g. hotels in the market with minor asset London, Paris, Amsterdam, Berlin] management initiatives", + {"entities":[[0,11,"RISIKOPROFIL"]]}, + ), + ( + "Risikoprofil: Core, Core +", + {"entities":[[14,26,"RISIKOPROFIL"]]}, + ), + ( + "Name des Fonds Name des Investmentmanagers Allgemeine Informationen Name des Ansprechpartners Telefonnummer des Ansprechpartners E-Mail des Ansprechpartners Art des Anlagevehikels Struktur des Anlagevehikels Sitz des Anlagevehikels Struktur des Antagevehikels vom Manager festgelegter Stil Rechtsform Jahr des ersten Closings Laufzeit Geplantes Jahr der Auflösung Ziel-Netto-IRR / Gesamtrendite* Zielvolumen des Anlagevehikels Ziel-LTY ‚Aktueller LTV Ziirraiaein Maximaler LTV Zielregionfen)/Jand Zielsektoren Zielanlagestrategie INREV Fonds Offen Deutschland Core, Core + Offener Immobilien-Spezialfonds 2022 10 - 12 Jahre 2032 - 2034 7,50%+ 250 Mio. € 20% 0% 20% Führende Metropolregionen Deutschlands und ausgewählte Standorte >50T Einw. Wohnimmobilien Wertstabile Wohnimmobilien (mit Bestandsentwicklungen)", + {"entities":[[560,572,"RISIKOPROFIL"]]}, + ), + ( + "Core/Core+ strategy, with tactical exposure to development projects aiming at enhancing the quality of the portfolio over time", + {"entities":[[0,10,"RISIKOPROFIL"]]}, + ), + ( + "Strategie - Übersicht Risikoprofil Core+ Halten-Strategie Kaufen — Halten (langfristig) — Exit 1. Nachvermietungsstrategie Anlagestrategien 2. Standortaufwertungsstrategie 3. Strategie der Aufwertung der Immobilien Niederlande (max. 35 %) Länderallokation Frankreich (max. 35 %) (in % vom Zielvolumen) Skandinavien (Schweden, Dänemark) (max. 35 %) Deutschland (<= 10 %)", + {"entities":[[35,40,"RISIKOPROFIL"]]}, + ) ] \ No newline at end of file