my-project-public

repository

loading code, commits, and activity

repositories

loading repo index

"""
Entity Sketching System
Lightweight entity extraction and fuzzy matching without heavy NLP dependencies.

Uses regex patterns for entity extraction and pure Python Levenshtein distance
for fuzzy matching. No spaCy, no PyTorch, no external NLP libraries.

Storage: TripleStore triples (subject=memory_id, predicate="mentions", object="entity_name")
"""
#10
import re
from typing import List, Optional, Set, Tuple
#13
#14
#15	# =============================================================================
#16	# STOP WORDS — filtered from entity extraction
#17	# =============================================================================
#18
# Words that are never treated as an entity when they appear on their own.
# Grouped by grammatical role purely for readability; membership is what matters.
ENTITY_EXTRACTION_STOP_WORDS: Set[str] = {
    # articles and conjunctions
    "the", "a", "an", "and", "or", "but",
    # prepositions
    "in", "on", "at", "to", "for", "of", "with", "by", "from", "as",
    # forms of "to be"
    "is", "was", "are", "were", "be", "been", "being",
    # auxiliaries and modals
    "have", "has", "had", "do", "does", "did", "will",
    "would", "could", "should", "may", "might", "can", "shall",
    # personal pronouns
    "i", "you", "he", "she", "it", "we", "they",
    "me", "him", "her", "us", "them",
    # possessives
    "my", "your", "his", "its", "our", "their",
    # demonstratives and locatives
    "this", "that", "these", "those", "here", "there",
    # question words
    "where", "when", "what", "which", "who", "whom", "whose", "how", "why",
}

# Legacy name kept so existing references to _STOP_WORDS keep working;
# it is the same set object, not a copy.
_STOP_WORDS = ENTITY_EXTRACTION_STOP_WORDS
#32
#33
#34	# =============================================================================
#35	# REGEX PATTERNS FOR ENTITY EXTRACTION
#36	# =============================================================================
#37
# Compiled patterns tried in order by the extractor. Every pattern exposes the
# candidate entity as capture group 1.
_ENTITY_PATTERNS = [
    # @mentions: @username (group 1 excludes the leading "@")
    re.compile(r'@(\w{2,30})'),
    # Hashtags: #topic (group 1 excludes the leading "#")
    re.compile(r'#(\w{2,30})'),
    # Quoted phrases: "Hello World"
    re.compile(r'"([^"]{2,50})"'),
    # Single-quoted phrases: 'Hello World'
    re.compile(r"'([^']{2,50})'"),
    # Capitalized word sequences (2-5 words): New York, Abdias J, San Francisco Bay Area
    # Each word is one capital letter followed by ZERO OR MORE letters. Without
    # the "*" quantifiers every word would have to be exactly two characters
    # long and none of the examples above could match.
    re.compile(r'\b([A-Z][a-zA-Z]*(?:\s+[A-Z][a-zA-Z]*){1,4})\b'),
    # Single capitalized word (fallback): Abdias, Python, John
    re.compile(r'\b([A-Z][a-zA-Z]{1,20})\b'),
]
#52
#53
#54	# =============================================================================
#55	# 1. PURE PYTHON LEVENSHTEIN
#56	# =============================================================================
#57
def levenshtein_distance(s1: str, s2: str) -> int:
    """
    Return the Levenshtein edit distance between ``s1`` and ``s2``.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn one string into the other.
    Comparison is exact (case-sensitive). Implemented row by row over the
    longer string, so only rows the length of the shorter string are kept.
    """
    # Iterate over the longer string and keep rows sized by the shorter one.
    if len(s1) < len(s2):
        longer, shorter = s2, s1
    else:
        longer, shorter = s1, s2

    # Distance to an empty string is simply the other string's length.
    if not shorter:
        return len(longer)

    width = len(shorter)
    # Row for the empty prefix of ``longer``: j edits to reach shorter[:j].
    above = list(range(width + 1))

    for row_number, long_char in enumerate(longer, start=1):
        # First cell: deleting every character of longer[:row_number].
        row = [row_number]
        for col, short_char in enumerate(shorter):
            from_above = above[col + 1] + 1
            from_left = row[col] + 1
            diagonal = above[col] + (0 if long_char == short_char else 1)
            row.append(min(from_above, from_left, diagonal))
        above = row

    return above[width]
#87
#88
def similarity(s1: str, s2: str) -> float:
    """
    Entity-aware similarity score: 1.0 = identical, 0.0 = completely different.

    Both inputs are lower-cased and stripped of surrounding whitespace before
    comparison. The score is chosen by the first rule that applies:

    * equal after normalisation (including two empty strings) -> 1.0
    * exactly one side empty after normalisation              -> 0.0
    * one is a prefix of the other   -> 0.7 + 0.3 * (shorter / longer),
      e.g. "Abdias" vs "Abdias J" = 0.925
    * one is a substring of the other -> 0.5 + 0.3 * (shorter / longer),
      e.g. "Smith" vs "Mr. Smith"
    * otherwise -> 1 - levenshtein_distance / length of the longer string
    """
    s1_lower = s1.lower().strip()
    s2_lower = s2.lower().strip()

    if s1_lower == s2_lower:
        return 1.0

    # An empty string is a prefix and a substring of every string, so without
    # this guard "" vs "abc" would earn the 0.7 prefix bonus. Nothing in common
    # means 0.0, which is also what the edit-distance formula yields.
    if not s1_lower or not s2_lower:
        return 0.0

    if len(s1_lower) <= len(s2_lower):
        shorter, longer = s1_lower, s2_lower
    else:
        shorter, longer = s2_lower, s1_lower
    length_ratio = len(shorter) / len(longer)

    # Prefix match bonus: 'Abdias' vs 'Abdias J'
    if longer.startswith(shorter):
        return 0.7 + length_ratio * 0.3  # 0.7 base + scaled bonus

    # Substring match: 'Mr. Smith' contains 'Smith'
    if shorter in longer:
        return 0.5 + length_ratio * 0.3

    dist = levenshtein_distance(s1_lower, s2_lower)
    return 1.0 - (dist / len(longer))
#120
#121
def _occurs_as_whole_words(phrase: str, container: str) -> bool:
    """Return True if ``phrase`` appears in ``container`` bounded by non-word characters."""
    return re.search(r'(?<!\w)' + re.escape(phrase) + r'(?!\w)', container) is not None


def extract_entities_regex(text: str) -> List[str]:
    """
    Extract entity candidates from text using regex patterns.

    Every pattern in ``_ENTITY_PATTERNS`` is run over ``text`` and capture
    group 1 of each match is taken as a candidate. A candidate is dropped when:

    * it is shorter than two characters after stripping whitespace;
    * it is a single word found (case-insensitively) in ``_STOP_WORDS``;
    * it consists only of digits once '.' and ',' are removed;
    * it is a single word starting with a lowercase letter that is not
      directly preceded by '@' or '#' in ``text``.

    Afterwards, a candidate that occurs as whole words inside a longer
    candidate (e.g. "New" and "York" inside "New York") is removed in favour
    of the longer one. Containment is checked on word boundaries, so "Al" is
    not swallowed by "Alice". Candidates seen directly after '@' or '#', or
    that themselves start with one of those characters, are never removed
    this way and never cause another candidate to be removed.

    Returns a sorted list of unique entity strings; an empty list for empty
    or non-string input.
    """
    if not text or not isinstance(text, str):
        return []

    entities: Set[str] = set()
    # Candidates that appeared at least once right after '@' or '#'. The
    # capture group strips the sigil, so origin must be read from the text.
    tagged: Set[str] = set()

    for pattern in _ENTITY_PATTERNS:
        for match in pattern.finditer(text):
            entity = match.group(1).strip()
            # Filter: must be at least 2 chars
            if len(entity) < 2:
                continue
            # Filter out stop words (single word only)
            words = entity.split()
            single_word = len(words) == 1
            if single_word and entity.lower() in _STOP_WORDS:
                continue
            # Filter out pure numbers
            if entity.replace('.', '').replace(',', '').isdigit():
                continue

            group_start = match.start(1)
            is_tagged = group_start > 0 and text[group_start - 1] in ('@', '#')

            # Standalone lowercase words are only kept as @mentions / #hashtags.
            if single_word and entity[0].islower() and not is_tagged:
                continue

            entities.add(entity)
            if is_tagged:
                tagged.add(entity)

    # Keep only the longest match when shorter candidates are whole-word
    # pieces of a longer one ("New", "York" -> "New York").
    kept: List[str] = []
    for entity in entities:
        if entity in tagged or entity.startswith(('@', '#')):
            kept.append(entity)
            continue
        absorbed = any(
            other != entity
            and not other.startswith(('@', '#'))
            and _occurs_as_whole_words(entity, other)
            for other in entities
        )
        if not absorbed:
            kept.append(entity)

    return sorted(kept)
#187
#188
def find_similar_entities(entity: str, known_entities: List[str], threshold: float = 0.8) -> List[Tuple[str, float]]:
    """
    Rank the entries of ``known_entities`` that resemble ``entity``.

    An entry that is exactly equal to ``entity`` is always reported with a
    score of 1.0, regardless of ``threshold``. Any other entry is scored with
    ``similarity`` and reported only when its score is at least ``threshold``.

    Returns ``(entity_name, similarity_score)`` tuples ordered from highest
    to lowest score; entries with equal scores keep their input order.
    """
    scored: List[Tuple[str, float]] = []

    for candidate in known_entities:
        exact = candidate == entity
        # Exact hits skip the similarity computation entirely.
        score = 1.0 if exact else similarity(entity, candidate)
        if exact or score >= threshold:
            scored.append((candidate, score))

    return sorted(scored, key=lambda pair: pair[1], reverse=True)
#206
#207
def entity_extraction_performance(text: str, iterations: int = 1000) -> float:
    """
    Measure entity extraction performance.

    Calls ``extract_entities_regex(text)`` ``iterations`` times back to back
    and times the whole loop with ``time.perf_counter``.

    Args:
        text: Input passed unchanged to every extraction call.
        iterations: Number of extraction calls to time; must be at least 1.

    Returns:
        Average wall-clock time per extraction, in milliseconds.

    Raises:
        ValueError: If ``iterations`` is less than 1. Zero would otherwise
            divide by zero and a negative count would yield a negative time.
    """
    import time

    if iterations < 1:
        raise ValueError("iterations must be at least 1, got %r" % (iterations,))

    start = time.perf_counter()
    for _ in range(iterations):
        extract_entities_regex(text)
    elapsed = time.perf_counter() - start

    # Seconds per call -> milliseconds per call.
    return (elapsed / iterations) * 1000
#219

z6Mkq5mY3JWtxoxUobWcfNHm7AkRubgSWEZTkBVqZXJviFZ5/my-project-public