repositories
loading repo index
repositories
loading repo index
repository
loading code, commits, and activity
public Clawd ADK gateway launch mirror
stars
latest
clone command
git clone gitlawb://did:key:z6Mkq5mY...iFZ5/my-project-publ...
git clone gitlawb://did:key:z6Mkq5mY.../my-project-publ...
2fa351d6 docs: add automaton and perps launch sources (16d ago)
| #1 | """Chat text normalization for Mnemosyne ingestion (NAI-1). |
| #2 | |
| #3 | Parsers like ClausIE/MinIE were built for Wikipedia, not chat logs. |
| #4 | This module provides aggressive regex normalization to make casual |
| #5 | messages parseable by structured extraction tools. |
| #6 | |
| #7 | All algorithmic. Zero LLM calls. Zero new dependencies. |
| #8 | """ |
| #9 | |
| #10 | import re |
| #11 | from typing import Optional |
| #12 | |
# ── Contraction expansion table ──────────────────────────────
# Each pattern is anchored on word boundaries (\b), not on spaces.
# Rules are applied top to bottom, so a longer form must come before any
# shorter form it contains: "u're" has to be tried before "u", otherwise
# "u" matches first (the apostrophe is a word boundary) and the "u're"
# rule can never fire.
_CONTRACTIONS: list[tuple[str, str]] = [
    (r"\bu're\b", "you are"),
    (r"\bu\b", "you"),
    (r"\bur\b", "your"),
    (r"\br\b", "are"),
    (r"\by\b", "why"),
    (r"\bb4\b", "before"),
    (r"\bbc\b", "because"),
    (r"\bcuz\b", "because"),
    (r"\bgonna\b", "going to"),
    (r"\bwanna\b", "want to"),
    (r"\bgotta\b", "got to"),
    (r"\bkinda\b", "kind of"),
    (r"\bsorta\b", "sort of"),
    (r"\bdunno\b", "don't know"),
    (r"\blemme\b", "let me"),
    (r"\bgimme\b", "give me"),
    (r"\boutta\b", "out of"),
    (r"\bhafta\b", "have to"),
    (r"\bshoulda\b", "should have"),
    (r"\bwoulda\b", "would have"),
    (r"\bcoulda\b", "could have"),
]

# ── Filler / reaction words to strip ──────────────────────────
_FILLER_WORDS: set[str] = {
    "lol", "lmao", "lmaoo", "lmfao", "rofl", "omg", "omgg",
    "omggg", "brb", "idk", "idc", "tbh", "imo", "imho",
    "fwiw", "irl", "afaik", "iirc", "tldr", "nvm", "ikr",
    "wtf", "smh", "fr", "ngl", "istg", "w", "wdym",
}

# ── Fragment-starting verbs that need implicit subjects ───────
_FRAGMENT_STARTERS: set[str] = {
    "going", "coming", "thinking", "wondering",
    "feeling", "trying", "hoping", "planning",
    "working", "looking", "checking", "running",
    "testing", "building", "fixing", "deploying",
}

# ── Precompiled helpers for normalize_chat ────────────────────
# Any run of characters outside the 7-bit ASCII range (emojis included).
_NON_ASCII_RUN = re.compile(r"[^\x00-\x7F]+")
# Three or more repeats of one character. Digits are excluded so numbers
# such as "1000" are not rewritten to "10"; whitespace is excluded because
# it is normalized separately.
_REPEATED_CHAR_RUN = re.compile(r"([^\d\s])\1{2,}")
# Punctuation ignored at the edges of a token when testing for filler words.
_EDGE_PUNCTUATION = ".,!?;:'\""


def normalize_chat(text: str, *, add_implicit_subjects: bool = True) -> Optional[str]:
    """Aggressive regex normalization for casual chat messages.

    Returns None if the message has no extractable meaning (too short,
    only filler/reactions).

    Processing order:
    1. Lowercase
    2. Replace emojis and other non-ASCII runs with a space
    3. Collapse characters repeated 3+ times (omgggg → omg); digits are
       left untouched
    4. Expand contractions via word-boundary regex (u → you, gonna → going to)
    5. Strip filler/reaction words (lol, omg, brb, etc.)
    6. Normalize whitespace
    7. Fragment detection: fewer than 2 remaining words = None, unless the
       single remaining word is longer than 5 characters
    8. Implicit subject injection for 2-word fragments (fixing tests →
       i am fixing tests)

    Non-ASCII removal and repeat collapsing run before filler stripping so
    that elongated or emoji-decorated fillers ("lolll", "lol😂") reduce to
    their base form and are actually removed.

    Args:
        text: Raw chat message
        add_implicit_subjects: If True, prepend 'i am' to two-word
            fragments whose first word is in _FRAGMENT_STARTERS

    Returns:
        Normalized text, or None if no extractable meaning remains.
    """
    if not text or not text.strip():
        return None

    # Step 1: Lowercase
    text = text.lower()

    # Step 2: Remove emojis and non-ASCII (replaced by a space so the
    # neighbouring words stay separated)
    text = _NON_ASCII_RUN.sub(" ", text)

    # Step 3: Collapse repeated characters (omgggg → omg)
    text = _REPEATED_CHAR_RUN.sub(r"\1", text)

    # Step 4: Expand contractions (word-boundary regex, in table order)
    for pattern, replacement in _CONTRACTIONS:
        text = re.sub(pattern, replacement, text)

    # Step 5: Strip filler/reaction words, ignoring edge punctuation
    meaningful = [
        word for word in text.split()
        if word.strip(_EDGE_PUNCTUATION) not in _FILLER_WORDS
    ]
    if not meaningful:
        return None

    # Step 6: Normalize whitespace (split + single-space join)
    text = " ".join(meaningful)

    # Step 7: Fragment detection — need at least 2 meaningful words
    word_count = len(meaningful)
    if word_count < 2:
        # Single long word might be a name/tool/endpoint
        return text if len(text) > 5 else None

    # Step 8: Implicit subject injection (only for true fragments: 2 words)
    if (
        add_implicit_subjects
        and word_count == 2
        and meaningful[0] in _FRAGMENT_STARTERS
    ):
        text = "i am " + text

    return text
| #121 | |
| #122 | |
def normalize_batch(messages: list[str]) -> list[Optional[str]]:
    """Run normalize_chat over every message, preserving order.

    Each input message yields exactly one output entry: the normalized
    text, or None when normalize_chat returns None for that message.
    """
    return list(map(normalize_chat, messages))
| #129 | |
| #130 | |
| #131 | # ── Diagnostics: measure what fraction of messages survive normalization ── |
| #132 | |
def extraction_rate(messages: list[str]) -> dict:
    """Report how many messages survive normalization.

    Returns:
        Dict with these keys:
        - "total": number of input messages
        - "survived": number whose normalized form is not None
        - "dropped": number whose normalized form is None
        - "rate": survived / total rounded to 3 places (0.0 for empty input)
        - "dropped_samples": up to the first 5 original messages that
          were dropped
    """
    results = normalize_batch(messages)

    kept_count = sum(1 for outcome in results if outcome is not None)
    rejected = [
        original
        for original, outcome in zip(messages, results)
        if outcome is None
    ]

    # Guard the division: an empty batch reports a rate of 0.0.
    if messages:
        survival_rate = round(kept_count / len(messages), 3)
    else:
        survival_rate = 0.0

    return {
        "total": len(messages),
        "survived": kept_count,
        "dropped": len(rejected),
        "rate": survival_rate,
        "dropped_samples": rejected[:5],
    }
| #150 |