my-project-public

repository

loading code, commits, and activity

repositories

loading repo index

"""Chat text normalization for Mnemosyne ingestion (NAI-1).

Parsers like ClausIE/MinIE were built for Wikipedia, not chat logs.
This module provides aggressive regex normalization to make casual
messages parseable by structured extraction tools.

All algorithmic. Zero LLM calls. Zero new dependencies.
"""
#9
#10	import re
#11	from typing import Optional
#12
# ── Contraction expansion table ──────────────────────────────
# Patterns are anchored on regex word boundaries (\b), not on spaces, and are
# applied top to bottom. A longer form must be listed before any shorter form
# it contains: "u're" has to run before "u", otherwise the bare "u" rule
# rewrites it to "you're" first and the longer rule can never match.
_CONTRACTIONS: list[tuple[str, str]] = [
    (r"\bu're\b", "you are"),
    (r"\bu\b", "you"),
    (r"\bur\b", "your"),
    (r"\br\b", "are"),
    (r"\by\b", "why"),
    (r"\bb4\b", "before"),
    (r"\bbc\b", "because"),
    (r"\bcuz\b", "because"),
    (r"\bgonna\b", "going to"),
    (r"\bwanna\b", "want to"),
    (r"\bgotta\b", "got to"),
    (r"\bkinda\b", "kind of"),
    (r"\bsorta\b", "sort of"),
    (r"\bdunno\b", "don't know"),
    (r"\blemme\b", "let me"),
    (r"\bgimme\b", "give me"),
    (r"\boutta\b", "out of"),
    (r"\bhafta\b", "have to"),
    (r"\bshoulda\b", "should have"),
    (r"\bwoulda\b", "would have"),
    (r"\bcoulda\b", "could have"),
]

# Compiled once at import time so normalize_chat() does not have to look the
# patterns up again for every message.
_CONTRACTION_PATTERNS = [
    (re.compile(pattern), replacement) for pattern, replacement in _CONTRACTIONS
]

# ── Filler / reaction words to strip ──────────────────────────
_FILLER_WORDS: set[str] = {
    "lol", "lmao", "lmaoo", "lmfao", "rofl", "omg", "omgg",
    "omggg", "brb", "idk", "idc", "tbh", "imo", "imho",
    "fwiw", "irl", "afaik", "iirc", "tldr", "nvm", "ikr",
    "wtf", "smh", "fr", "ngl", "istg", "w", "wdym",
}

# ── Fragment-starting verbs that need implicit subjects ───────
_FRAGMENT_STARTERS: set[str] = {
    "going", "coming", "thinking", "wondering",
    "feeling", "trying", "hoping", "planning",
    "working", "looking", "checking", "running",
    "testing", "building", "fixing", "deploying",
}

# Anything outside 7-bit ASCII (emojis, but also accented letters).
_NON_ASCII_RE = re.compile(r"[^\x00-\x7F]+")

# Three or more of the same character in a row. Digits are excluded on
# purpose: "8000" or "1000000" are data, not emphasis, and must stay intact.
_REPEATED_CHAR_RE = re.compile(r"([^\d\s])\1{2,}")

# Punctuation ignored at token edges when testing a token against the
# filler list, so "lol!" and "omg," still count as filler.
_TOKEN_PUNCTUATION = ".,!?;:'\""


def normalize_chat(text: str, *, add_implicit_subjects: bool = True) -> Optional[str]:
    """Aggressive regex normalization for casual chat messages.

    Returns None if the message has no extractable meaning (empty, too
    short, or only filler/reactions).

    Processing order:
      1. Lowercase
      2. Replace emojis and every other non-ASCII run with a space
      3. Collapse runs of 3+ identical non-digit characters (sooooo → so)
      4. Expand contractions via word-boundary regex (u → you, gonna → going to)
      5. Strip filler/reaction words (lol, omg, brb, etc.)
      6. Normalize whitespace
      7. Fragment detection: fewer than 2 remaining words = None, except
         a single word longer than 5 characters, which is kept
      8. Implicit subject injection (going home → i am going home)

    Steps 2 and 3 deliberately run before step 5: elongated or
    emoji-decorated filler ("lolll", "omgggg", "lol😂") only matches the
    filler list once it has been reduced to its plain form.

    Args:
        text: Raw chat message.
        add_implicit_subjects: If True, prepend 'i am' to two-word
            fragments whose first word is in _FRAGMENT_STARTERS.

    Returns:
        Normalized text, or None if no extractable meaning remains.
    """
    if not text or not text.strip():
        return None

    # Step 1: Lowercase
    text = text.lower()

    # Step 2: Remove emojis and non-ASCII. Replaced by a space so that words
    # glued together by an emoji do not merge into one token.
    text = _NON_ASCII_RE.sub(" ", text)

    # Step 3: Collapse repeated characters (digits are left untouched)
    text = _REPEATED_CHAR_RE.sub(r"\1", text)

    # Step 4: Expand contractions (word-boundary regex, table order matters)
    for pattern, replacement in _CONTRACTION_PATTERNS:
        text = pattern.sub(replacement, text)

    # Steps 5 + 6: Strip filler/reaction words. Splitting on whitespace and
    # re-joining later also normalizes all whitespace to single spaces.
    words = [
        word
        for word in text.split()
        if word.strip(_TOKEN_PUNCTUATION) not in _FILLER_WORDS
    ]

    # Step 7: Fragment detection — need at least 2 remaining words
    if not words:
        return None
    if len(words) == 1:
        # Single long word might be a name/tool/endpoint
        only_word = words[0]
        return only_word if len(only_word) > 5 else None

    # Step 8: Implicit subject injection (only for true fragments: 2 words)
    if add_implicit_subjects and len(words) == 2 and words[0] in _FRAGMENT_STARTERS:
        words = ["i am"] + words

    return " ".join(words)
#121
#122
def normalize_batch(
    messages: list[str], *, add_implicit_subjects: bool = True
) -> list[Optional[str]]:
    """Normalize a batch of messages, one result per input message.

    Useful for preprocessing entire conversations before entity extraction.

    Args:
        messages: Raw chat messages, in conversation order.
        add_implicit_subjects: Forwarded unchanged to normalize_chat() for
            every message. Defaults to True, which matches calling
            normalize_chat() with its own default.

    Returns:
        A list the same length and order as ``messages``. Each entry is
        whatever normalize_chat() returned for that message: the normalized
        text, or None for messages it rejected.
    """
    return [
        normalize_chat(message, add_implicit_subjects=add_implicit_subjects)
        for message in messages
    ]
#129
#130
# ── Diagnostics: measure what fraction of messages survive normalization ──

def extraction_rate(messages: list[str]) -> dict:
    """Report how many messages survive normalization.

    Runs normalize_batch() over ``messages`` and counts a message as dropped
    when its normalized result is None.

    Returns:
        Dict with keys ``total``, ``survived``, ``dropped``, ``rate``
        (survived / total rounded to 3 decimals, 0.0 for an empty input)
        and ``dropped_samples`` (the first five dropped raw messages).
    """
    outcomes = normalize_batch(messages)

    # Pair each raw message with its outcome so the samples show the
    # original text rather than the (None) normalized result.
    rejected = [raw for raw, cleaned in zip(messages, outcomes) if cleaned is None]

    total = len(messages)
    kept = total - len(rejected)

    if total:
        rate = round(kept / total, 3)
    else:
        rate = 0.0  # avoid dividing by zero on an empty batch

    return {
        "total": total,
        "survived": kept,
        "dropped": len(rejected),
        "rate": rate,
        "dropped_samples": rejected[:5],
    }
#150

z6Mkq5mY3JWtxoxUobWcfNHm7AkRubgSWEZTkBVqZXJviFZ5/my-project-public