repositories
loading repo index
repositories
loading repo index
repository
loading code, commits, and activity
Mirrored from https://github.com/benmaster82/Kwipu
stars
latest
clone command
git clone gitlawb://did:key:z6MkqRzA...RfoM/benmaster82-Kwi...git clone gitlawb://did:key:z6MkqRzA.../benmaster82-Kwi...908f0e4eAdd MCP badge18d ago| #1 | """ |
| #2 | Multilingual configuration for Geode Graph. |
| #3 | |
| #4 | Supported languages: Italian, English, French, German, Spanish, Portuguese. |
| #5 | Auto-detection based on stopwords and patterns. |
| #6 | """ |
| #7 | |
| #8 | import re |
| #9 | import unicodedata |
| #10 | from collections import Counter |
| #11 | |
| #12 | # ========================================== |
| #13 | # STOPWORDS BY LANGUAGE |
| #14 | # ========================================== |
# Stopword sets keyed by two-letter language code. Entries are lowercase and
# written without diacritics, which is the form produced by _normalize_token
# (tokenize and detect_language compare normalized tokens against these sets).
# Each string is split on whitespace; words repeated inside a string
# (e.g. "la" in the Italian list) are collapsed by frozenset.
# NOTE(review): the German list uses digraph spellings ("fuer", "ueber"), but
# stripping combining marks turns "für" into "fur" -- confirm whether the
# accent-stripped spellings should be listed as well.
STOPWORDS = {
    "it": frozenset(
        "il lo la i gli le un uno una di del dello della dei degli delle "
        "a al allo alla ai agli alle da dal dallo dalla dai dagli dalle "
        "in nel nello nella nei negli nelle con su sul sullo sulla sui sugli sulle "
        "per tra fra e o ma che chi cui non ne se si come dove quando quanto "
        "anche ancora piu molto questo quello sono stato essere avere fare "
        "ha ho hanno era erano suo sua suoi sue loro tutto tutti tutta tutte "
        "altro altri altra altre stesso stessa stessi stesse ogni quale quali "
        "dopo prima durante senza verso fino sopra sotto dentro fuori "
        "poi gia qui la li ora mai sempre solo proprio cosi perche".split()
    ),
    "en": frozenset(
        "the a an and or but not is are was were be been being have has had "
        "do does did will would shall should can could may might must "
        "i me my we our you your he him his she her it its they them their "
        "this that these those what which who whom how where when why "
        "in on at to for from by with of about into through during before after "
        "above below between under over up down out off then than so if "
        "all each every both few more most other some such no nor too very "
        "just also back only own same here there again further once".split()
    ),
    "fr": frozenset(
        "le la les un une des de du au aux en dans par pour sur avec sans "
        "ce cette ces son sa ses leur leurs mon ma mes ton ta tes notre votre "
        "je tu il elle nous vous ils elles on ne pas plus que qui quoi dont ou "
        "et ou mais donc car ni si comme quand comment pourquoi "
        "est sont etait etaient etre avoir fait faire "
        "tout tous toute toutes autre autres meme aussi encore bien tres "
        "ici peu beaucoup trop assez".split()
    ),
    "de": frozenset(
        "der die das ein eine einer eines einem einen dem den des "
        "und oder aber nicht ist sind war waren sein haben hat hatte "
        "ich du er sie es wir ihr sie mein dein sein ihr unser euer "
        "in an auf aus bei mit nach von zu um durch fuer ueber unter "
        "was wer wie wo wann warum welch welche welcher welches "
        "auch noch schon nur sehr viel mehr als wenn dann also "
        "kein keine keiner keines diesem dieser dieses diese "
        "alle alles ander andere anderer anderes".split()
    ),
    "es": frozenset(
        "el la los las un una unos unas de del al en por para con sin sobre "
        "entre hasta desde durante ante bajo contra segun "
        "yo tu el ella nosotros vosotros ellos ellas usted ustedes "
        "mi tu su nuestro vuestro sus me te se nos os le les lo "
        "que quien cual cuyo donde cuando como cuanto porque "
        "y o pero sino ni mas menos tan tanto como si no "
        "es son era eran ser estar haber tener hacer "
        "todo todos toda todas otro otros otra otras mismo misma cada".split()
    ),
    "pt": frozenset(
        "o a os as um uma uns umas de do da dos das em no na nos nas "
        "por para com sem sobre entre ate desde durante "
        "eu tu ele ela nos vos eles elas voce voces "
        "meu minha seu sua nosso nossa seus suas me te se lhe "
        "que quem qual onde quando como quanto porque "
        "e ou mas nem se nao mais menos tao tanto como "
        "ser estar ter haver fazer ir poder dever "
        "todo todos toda todas outro outros outra outras mesmo mesma cada".split()
    ),
}
| #77 | |
| #78 | # ========================================== |
| #79 | # MONTH NAMES BY LANGUAGE |
| #80 | # ========================================== |
# Per-language mapping from a lowercase, accent-free month name to its
# zero-padded two-digit month number (as a string). The keys feed both the
# reverse lookup _ALL_MONTHS and the alternation built by _build_date_regex.
# NOTE(review): German March is spelled "maerz" here, while stripping
# combining marks from "März" yields "marz" -- confirm which spelling(s)
# the callers are expected to see.
MONTH_NAMES = {
    "it": {
        "gennaio": "01", "febbraio": "02", "marzo": "03", "aprile": "04",
        "maggio": "05", "giugno": "06", "luglio": "07", "agosto": "08",
        "settembre": "09", "ottobre": "10", "novembre": "11", "dicembre": "12",
    },
    "en": {
        "january": "01", "february": "02", "march": "03", "april": "04",
        "may": "05", "june": "06", "july": "07", "august": "08",
        "september": "09", "october": "10", "november": "11", "december": "12",
    },
    "fr": {
        "janvier": "01", "fevrier": "02", "mars": "03", "avril": "04",
        "mai": "05", "juin": "06", "juillet": "07", "aout": "08",
        "septembre": "09", "octobre": "10", "novembre": "11", "decembre": "12",
    },
    "de": {
        "januar": "01", "februar": "02", "maerz": "03", "april": "04",
        "mai": "05", "juni": "06", "juli": "07", "august": "08",
        "september": "09", "oktober": "10", "november": "11", "dezember": "12",
    },
    "es": {
        "enero": "01", "febrero": "02", "marzo": "03", "abril": "04",
        "mayo": "05", "junio": "06", "julio": "07", "agosto": "08",
        "septiembre": "09", "octubre": "10", "noviembre": "11", "diciembre": "12",
    },
    "pt": {
        "janeiro": "01", "fevereiro": "02", "marco": "03", "abril": "04",
        "maio": "05", "junho": "06", "julho": "07", "agosto": "08",
        "setembro": "09", "outubro": "10", "novembro": "11", "dezembro": "12",
    },
}
| #113 | |
# Reverse map: month name -> language code. A name shared by several
# languages (e.g. "april", "agosto") keeps the language that appears last in
# MONTH_NAMES. Built with a comprehension so that no loop variables are left
# behind in the module namespace.
_ALL_MONTHS = {
    month_name: lang_code
    for lang_code, month_table in MONTH_NAMES.items()
    for month_name in month_table
}
| #119 | |
| #120 | # ========================================== |
| #121 | # TEMPORAL KEYWORDS BY LANGUAGE |
| #122 | # ========================================== |
# Per-language sets of lowercase, accent-free keywords grouped under the
# "temporal keywords" heading (meetings, decisions, milestones, deadlines...).
# Some words appear under several languages (e.g. "review", "kickoff").
# NOTE(review): the consumers of these sets are not visible in this file --
# presumably they are compared against normalized tokens; verify at call sites.
TEMPORAL_KEYWORDS = {
    "it": {
        "riunione", "riunioni", "meeting", "kickoff", "review",
        "semestrale", "decisione", "decisioni", "risultato", "risultati",
        "milestone", "avviato", "avviata", "completato", "completata",
        "progresso", "scadenza", "consegna", "approvato", "approvata",
    },
    "en": {
        "meeting", "meetings", "kickoff", "review", "sprint",
        "decision", "decisions", "result", "results", "outcome",
        "milestone", "started", "completed", "finished", "delivered",
        "progress", "deadline", "delivery", "approved", "launched",
    },
    "fr": {
        "reunion", "reunions", "revue", "bilan", "lancement",
        "decision", "decisions", "resultat", "resultats",
        "jalon", "demarre", "termine", "livre", "approuve",
        "progres", "echeance", "livraison",
    },
    "de": {
        "besprechung", "sitzung", "treffen", "review", "kickoff",
        "entscheidung", "entscheidungen", "ergebnis", "ergebnisse",
        "meilenstein", "gestartet", "abgeschlossen", "geliefert",
        "fortschritt", "frist", "lieferung", "genehmigt",
    },
    "es": {
        "reunion", "reuniones", "revision", "lanzamiento",
        "decision", "decisiones", "resultado", "resultados",
        "hito", "iniciado", "completado", "entregado", "aprobado",
        "progreso", "plazo", "entrega",
    },
    "pt": {
        "reuniao", "reunioes", "revisao", "lancamento",
        "decisao", "decisoes", "resultado", "resultados",
        "marco", "iniciado", "concluido", "entregue", "aprovado",
        "progresso", "prazo", "entrega",
    },
}
| #161 | |
# Union of the temporal keywords of every language (for cross-language
# matching). Computed in a single expression so that no loop variable is left
# behind in the module namespace; the result is still a plain mutable set.
ALL_TEMPORAL_KEYWORDS = set().union(*TEMPORAL_KEYWORDS.values())
| #166 | |
| #167 | # ========================================== |
| #168 | # RELATION PATTERNS BY LANGUAGE (wikilink inference) |
| #169 | # ========================================== |
# Per-language lists of (regex, relation label) pairs used by infer_relation.
# Order matters twice: infer_relation walks the languages in the order they
# are listed here and, within a language, returns the label of the FIRST
# pattern that matches, so more specific patterns should stay above generic
# ones. Patterns are lowercase and accent-free and are applied with re.search,
# i.e. they match anywhere in the line, including inside longer words.
RELATION_PATTERNS = {
    "it": [
        (r"responsabile\s+(di|del|della|dello|degli|delle)", "E' responsabile di"),
        (r"coordinat[oa]\s+(da|di|del)", "E' coordinato da"),
        (r"sviluppat[oa]\s+(da|di)", "E' sviluppato da"),
        (r"gestit[oa]\s+(da|di)", "E' gestito da"),
        (r"lavora\s+(per|a|al|alla|con|presso|in)", "Lavora presso"),
        (r"collabora\s+(con|strettamente)", "Collabora con"),
        (r"co-?autore", "E' co-autore con"),
        (r"pubblicazion[ei]", "Ha pubblicazione"),
        (r"brevetto", "Ha brevetto con"),
        (r"finanzia(mento|to)", "E' finanziato da"),
        (r"partner", "E' partner di"),
        (r"membro\s+(del|di)", "E' membro di"),
        (r"supervisio?n[ae]", "Supervisiona"),
        (r"accesso", "Ha accesso a"),
        (r"utilizz[ao]", "Utilizza"),
        (r"basat[oa]\s+su", "E' basato su"),
        (r"addestrat[oa]\s+su", "E' addestrato su"),
        (r"ospita", "Ospita"),
        (r"nell'ambito\s+(del|di)", "Fa parte di"),
        (r"avviat[oa]", "E' avviato da"),
        (r"test\s+(su|con|del)", "Testa con"),
    ],
    "en": [
        (r"responsible\s+for", "Is responsible for"),
        (r"coordinated\s+by", "Is coordinated by"),
        (r"developed\s+by", "Is developed by"),
        (r"managed\s+by", "Is managed by"),
        (r"works\s+(for|at|with|in)", "Works at"),
        (r"collaborates?\s+with", "Collaborates with"),
        (r"co-?author", "Is co-author with"),
        (r"publication", "Has publication"),
        (r"patent", "Has patent with"),
        (r"fund(ed|ing)", "Is funded by"),
        (r"partner", "Is partner of"),
        (r"member\s+of", "Is member of"),
        (r"supervis(es|ed|ing)", "Supervises"),
        (r"access\s+to", "Has access to"),
        (r"uses?", "Uses"),
        (r"based\s+on", "Is based on"),
        (r"trained\s+on", "Is trained on"),
        (r"hosts?", "Hosts"),
        (r"part\s+of", "Is part of"),
        (r"launched", "Is launched by"),
        (r"tests?\s+(on|with)", "Tests with"),
    ],
    "fr": [
        (r"responsable\s+(de|du|des)", "Est responsable de"),
        (r"coordonne\s+par", "Est coordonne par"),
        (r"developpe\s+par", "Est developpe par"),
        (r"gere\s+par", "Est gere par"),
        (r"travaille\s+(pour|a|avec|chez)", "Travaille pour"),
        (r"collabore\s+avec", "Collabore avec"),
        (r"co-?auteur", "Est co-auteur avec"),
        (r"publication", "A une publication"),
        (r"brevet", "A un brevet avec"),
        (r"financ(e|ement)", "Est finance par"),
        (r"partenaire", "Est partenaire de"),
        (r"membre\s+(de|du)", "Est membre de"),
        (r"supervis(e|ion)", "Supervise"),
        (r"acces", "A acces a"),
        (r"utilis(e|ation)", "Utilise"),
        (r"base\s+sur", "Est base sur"),
        (r"entraine\s+sur", "Est entraine sur"),
        (r"heberge", "Heberge"),
        (r"dans\s+le\s+cadre\s+(de|du)", "Fait partie de"),
        (r"lance", "Est lance par"),
        (r"test(e|s)\s+(sur|avec)", "Teste avec"),
    ],
    "de": [
        (r"verantwortlich\s+fuer", "Ist verantwortlich fuer"),
        (r"koordiniert\s+von", "Wird koordiniert von"),
        (r"entwickelt\s+von", "Wird entwickelt von"),
        (r"verwaltet\s+von", "Wird verwaltet von"),
        (r"arbeitet\s+(fuer|bei|mit|in)", "Arbeitet bei"),
        (r"zusammenarbeit\s+mit", "Arbeitet zusammen mit"),
        (r"co-?autor", "Ist Co-Autor mit"),
        (r"publikation", "Hat Publikation"),
        (r"patent", "Hat Patent mit"),
        (r"finanzier(t|ung)", "Wird finanziert von"),
        (r"partner", "Ist Partner von"),
        (r"mitglied", "Ist Mitglied von"),
        (r"betreu(t|ung)", "Betreut"),
        (r"zugang", "Hat Zugang zu"),
        (r"verwend(et|ung)", "Verwendet"),
        (r"basiert\s+auf", "Basiert auf"),
        (r"trainiert\s+auf", "Trainiert auf"),
        (r"beherbergt", "Beherbergt"),
        (r"im\s+rahmen\s+(von|des)", "Ist Teil von"),
        (r"gestartet", "Wird gestartet von"),
        (r"test(et|s)\s+(auf|mit)", "Testet mit"),
    ],
    "es": [
        (r"responsable\s+(de|del)", "Es responsable de"),
        (r"coordinado\s+por", "Es coordinado por"),
        (r"desarrollado\s+por", "Es desarrollado por"),
        (r"gestionado\s+por", "Es gestionado por"),
        (r"trabaja\s+(para|en|con)", "Trabaja en"),
        (r"colabora\s+con", "Colabora con"),
        (r"co-?autor", "Es co-autor con"),
        (r"publicacion", "Tiene publicacion"),
        (r"patente", "Tiene patente con"),
        (r"financiad[oa]", "Es financiado por"),
        (r"socio", "Es socio de"),
        (r"miembro\s+(de|del)", "Es miembro de"),
        (r"supervis(a|ion)", "Supervisa"),
        (r"acceso", "Tiene acceso a"),
        (r"utiliza", "Utiliza"),
        (r"basado\s+en", "Esta basado en"),
        (r"entrenado\s+en", "Esta entrenado en"),
        (r"aloja", "Aloja"),
        (r"en\s+el\s+marco\s+de", "Forma parte de"),
        (r"lanzado", "Es lanzado por"),
        (r"prueba\s+(en|con)", "Prueba con"),
    ],
    "pt": [
        (r"responsavel\s+(por|pelo|pela)", "E responsavel por"),
        (r"coordenado\s+por", "E coordenado por"),
        (r"desenvolvido\s+por", "E desenvolvido por"),
        (r"gerenciado\s+por", "E gerenciado por"),
        (r"trabalha\s+(para|em|com)", "Trabalha em"),
        (r"colabora\s+com", "Colabora com"),
        (r"co-?autor", "E co-autor com"),
        (r"publicacao", "Tem publicacao"),
        (r"patente", "Tem patente com"),
        (r"financiad[oa]", "E financiado por"),
        (r"parceiro", "E parceiro de"),
        (r"membro\s+(de|do|da)", "E membro de"),
        (r"supervis(a|ao)", "Supervisiona"),
        (r"acesso", "Tem acesso a"),
        (r"utiliza", "Utiliza"),
        (r"baseado\s+em", "E baseado em"),
        (r"treinado\s+em", "E treinado em"),
        (r"hospeda", "Hospeda"),
        (r"no\s+ambito\s+(de|do|da)", "Faz parte de"),
        (r"lancado", "E lancado por"),
        (r"testa\s+(em|com)", "Testa com"),
    ],
}
| #310 | |
# Fallback relation label per language: infer_relation returns the entry for
# the detected language of the line when none of RELATION_PATTERNS matches.
FALLBACK_RELATION = {
    "it": "E' collegato a",
    "en": "Is related to",
    "fr": "Est lie a",
    "de": "Ist verbunden mit",
    "es": "Esta relacionado con",
    "pt": "Esta relacionado com",
}
| #320 | |
| #321 | # ========================================== |
| #322 | # TOKENIZATION |
| #323 | # ========================================== |
# A token is a maximal run of ASCII letters, digits, and characters in the
# range U+00C0-U+024F (where the accented Latin letters live). Everything else
# (whitespace, punctuation, apostrophes, hyphens) acts as a separator.
# NOTE(review): the range also contains the signs U+00D7 and U+00F7 -- confirm
# that treating them as token characters is acceptable.
_TOKEN_RE = re.compile(r"[a-zA-Z\u00C0-\u024F0-9]+")
| #325 | |
| #326 | |
def _normalize_token(token: str) -> str:
    """Return ``token`` lowercased with its diacritics stripped.

    The string is lowercased, decomposed with Unicode NFKD, and every
    combining character produced by the decomposition is dropped
    (so ``"Été"`` becomes ``"ete"``).
    """
    decomposed = unicodedata.normalize("NFKD", token.lower())
    kept_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(kept_chars)
| #332 | |
| #333 | |
def _get_all_stopwords() -> frozenset:
    """Build one frozenset holding the stopwords of every language in STOPWORDS."""
    return frozenset().union(*STOPWORDS.values())


# Computed once at import time; used by tokenize when no language is given.
_ALL_STOPWORDS = _get_all_stopwords()
| #343 | |
| #344 | |
def tokenize(text: str, lang: str | None = None) -> list[str]:
    """Split ``text`` into normalized tokens with stopwords removed.

    Args:
        text: Text to tokenize.
        lang: Language code selecting the stopword set. When it is ``None``,
            empty, or not a key of ``STOPWORDS``, the union of the stopwords
            of all languages is used (safe for multilingual content).

    Returns:
        The normalized (lowercase, accent-free) tokens in order of
        appearance. Raw tokens of one or two characters are dropped; the
        length check is applied before normalization.
    """
    stopwords = STOPWORDS.get(lang, _ALL_STOPWORDS) if lang else _ALL_STOPWORDS

    tokens = []
    for raw in _TOKEN_RE.findall(text):
        if len(raw) <= 2:
            continue
        # Normalize once and reuse the result for both the stopword test and
        # the output (the previous version normalized every kept token twice).
        normalized = _normalize_token(raw)
        if normalized not in stopwords:
            tokens.append(normalized)
    return tokens
| #357 | |
| #358 | |
| #359 | # ========================================== |
| #360 | # LANGUAGE AUTO-DETECTION |
| #361 | # ========================================== |
def detect_language(text: str) -> str:
    """Guess the language of ``text`` from the stopwords it contains.

    Each language is scored by the number of *distinct* normalized tokens of
    the text that belong to its stopword set; repeated occurrences of the
    same word count once.

    Returns:
        The code of the highest-scoring language (it, en, fr, de, es, pt).
        On a tie the language listed first in ``STOPWORDS`` wins; when no
        stopword is found at all the result is ``"en"``.
    """
    vocabulary = {
        _normalize_token(raw) for raw in _TOKEN_RE.findall(text.lower())
    }

    best_lang = "en"
    best_score = 0
    for code, stopwords in STOPWORDS.items():
        hits = len(vocabulary.intersection(stopwords))
        # Strict comparison keeps the earliest language among equal scores.
        if hits > best_score:
            best_lang, best_score = code, hits
    return best_lang
| #381 | |
| #382 | |
| #383 | # ========================================== |
| #384 | # MULTILINGUAL DATE EXTRACTION |
| #385 | # ========================================== |
def _build_date_regex() -> re.Pattern:
    """Compile the case-insensitive regex recognising dates in every language.

    Four shapes are accepted, tried in this order at each position:
    ``<month name> <yyyy>``, ``yyyy-mm`` / ``yyyy-mm-dd`` (``-`` or ``/``),
    ``<d|dd> <month name> <yyyy>`` and ``Q<1-4> <yyyy>``.
    """
    names = [name for table in MONTH_NAMES.values() for name in table]
    # Longest names first so that e.g. "mayo" is preferred over "may".
    names.sort(key=len, reverse=True)
    month_alt = "|".join(names)

    alternatives = (
        rf"(?:(?:{month_alt})\s+\d{{4}})",
        r"(?:\d{4}[-/]\d{2}(?:[-/]\d{2})?)",
        rf"(?:\d{{1,2}}\s+(?:{month_alt})\s+\d{{4}})",
        r"(?:Q[1-4]\s+\d{4})",
    )
    return re.compile("|".join(alternatives), re.IGNORECASE)


# Compiled once at import time and shared by extract_date_tokens.
_DATE_RE = _build_date_regex()
| #403 | |
| #404 | |
def extract_date_tokens(text: str) -> list[str]:
    """Extract normalized temporal tokens from text in any supported language.

    For every date found by ``_DATE_RE`` the result receives, in order: the
    whole matched date (lowercase, accent-free), the month name if the date
    contains one, and the first four-digit number of the date (the year).

    Args:
        text: Free text that may contain dates.

    Returns:
        The collected tokens in order of appearance; empty when no date is
        found.
    """
    tokens: list[str] = []

    # The month names in the regex are accent-free, so the text has to be
    # accent-stripped before matching; otherwise dates such as
    # "décembre 2024" or "août 2023" are never recognised.
    searchable = _normalize_token(text)

    for match in _DATE_RE.finditer(searchable):
        date_str = match.group(0).lower()
        tokens.append(date_str)

        # Look up whole words rather than substrings, so that "mayo" does not
        # also yield "may" and "junio" does not also yield "juni".
        tokens.extend(
            word for word in re.findall(r"[a-z]+", date_str) if word in _ALL_MONTHS
        )

        year_match = re.search(r"\d{4}", date_str)
        if year_match:
            tokens.append(year_match.group(0))
    return tokens
| #419 | |
| #420 | |
| #421 | # ========================================== |
| #422 | # MULTILINGUAL RELATION INFERENCE |
| #423 | # ========================================== |
def infer_relation(line: str, subject: str, target: str) -> str:
    """Infer a relation label from the wording of ``line``.

    The patterns of every language in ``RELATION_PATTERNS`` are searched in
    declaration order and the label of the first match is returned; the
    label can therefore be in a different language than the line itself.
    When nothing matches, the fallback label of the detected language of the
    line is returned.

    Args:
        line: The text line providing the context of the link.
        subject: Not used by the current implementation; kept so existing
            callers keep working.
        target: Not used by the current implementation; kept so existing
            callers keep working.

    Returns:
        A relation label from ``RELATION_PATTERNS`` or ``FALLBACK_RELATION``
        (``"Is related to"`` if the detected language has no fallback).
    """
    # The patterns are written lowercase and without accents, so the line is
    # lowercased AND accent-stripped before matching; with lowercasing alone,
    # text such as "développé par" or "publicación" never matched.
    searchable = _normalize_token(line)

    for patterns in RELATION_PATTERNS.values():
        for pattern, relation in patterns:
            if re.search(pattern, searchable):
                return relation

    # No pattern matched: fall back to a generic label in the line's language.
    detected = detect_language(line)
    return FALLBACK_RELATION.get(detected, "Is related to")
| #437 |