repositories
loading repo index
repositories
loading repo index
repository
loading code, commits, and activity
public Clawd ADK gateway launch mirror
stars
latest
clone command
git clone gitlawb://did:key:z6Mkq5mY...iFZ5/my-project-publ...git clone gitlawb://did:key:z6Mkq5mY.../my-project-publ...2fa351d6docs: add automaton and perps launch sources16d ago| #1 | """ |
| #2 | Entity Sketching System |
| #3 | Lightweight entity extraction and fuzzy matching without heavy NLP dependencies. |
| #4 | |
| #5 | Uses regex patterns for entity extraction and pure Python Levenshtein distance |
| #6 | for fuzzy matching. No spaCy, no PyTorch, no external NLP libraries. |
| #7 | |
| #8 | Storage: TripleStore triples (subject=memory_id, predicate="mentions", object="entity_name") |
| #9 | """ |
| #10 | |
| #11 | import re |
| #12 | from typing import List, Optional, Set, Tuple |
| #13 | |
| #14 | |
| #15 | # ============================================================================= |
| #16 | # STOP WORDS — filtered from entity extraction |
| #17 | # ============================================================================= |
| #18 | |
# Common English function words. A single-word candidate whose lowercase form
# appears here is not treated as an entity.
ENTITY_EXTRACTION_STOP_WORDS: Set[str] = {
    # articles and conjunctions
    "the", "a", "an", "and", "or", "but",
    # prepositions
    "in", "on", "at", "to", "for", "of", "with", "by", "from", "as",
    # forms of "be"
    "is", "was", "are", "were", "be", "been", "being",
    # forms of "have"
    "have", "has", "had",
    # forms of "do"
    "do", "does", "did",
    # modal verbs
    "will", "would", "could", "should", "may", "might", "can", "shall",
    # subject pronouns
    "i", "you", "he", "she", "it", "we", "they",
    # object pronouns
    "me", "him", "her", "us", "them",
    # possessives
    "my", "your", "his", "its", "our", "their",
    # demonstratives and place adverbs
    "this", "that", "these", "those", "here", "there",
    # interrogatives
    "where", "when", "what", "which", "who", "whom", "whose", "how", "why",
}

# Legacy private name, kept so existing references continue to resolve.
_STOP_WORDS = ENTITY_EXTRACTION_STOP_WORDS
| #32 | |
| #33 | |
| #34 | # ============================================================================= |
| #35 | # REGEX PATTERNS FOR ENTITY EXTRACTION |
| #36 | # ============================================================================= |
| #37 | |
# Compiled extraction patterns, in the order they are applied. Every pattern
# exposes the candidate entity text as capture group 1.
_ENTITY_PATTERNS = list(map(re.compile, (
    r'@(\w{2,30})',                                        # @mention: @username
    r'#(\w{2,30})',                                        # hashtag: #topic
    r'"([^"]{2,50})"',                                     # double-quoted phrase
    r"'([^']{2,50})'",                                     # single-quoted phrase
    r'\b([A-Z][a-zA-Z]*(?:\s+[A-Z][a-zA-Z]*){1,4})\b',     # 2-5 capitalized words: New York
    r'\b([A-Z][a-zA-Z]{1,20})\b',                          # lone capitalized word (fallback)
)))
| #52 | |
| #53 | |
| #54 | # ============================================================================= |
| #55 | # 1. PURE PYTHON LEVENSHTEIN |
| #56 | # ============================================================================= |
| #57 | |
def levenshtein_distance(s1: str, s2: str) -> int:
    """
    Return the Levenshtein edit distance between two strings.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn one string into the other.
    Pure Python, no dependencies. Runs in O(len(s1) * len(s2)) time and keeps
    a single DP row sized by the shorter string.
    """
    # Orient the inputs so the DP row is as short as possible.
    long_str, short_str = (s1, s2) if len(s1) >= len(s2) else (s2, s1)
    width = len(short_str)

    # Against an empty string the distance is the other string's length.
    if width == 0:
        return len(long_str)

    # row[k] holds the distance between the processed prefix of long_str and
    # the first k characters of short_str.
    row = list(range(width + 1))

    for row_idx, ch_long in enumerate(long_str, start=1):
        diagonal = row[0]  # value from the previous row, one column left
        row[0] = row_idx

        for col, ch_short in enumerate(short_str, start=1):
            above = row[col]  # previous row, same column (not yet overwritten)
            step = 0 if ch_long == ch_short else 1
            row[col] = min(above + 1, row[col - 1] + 1, diagonal + step)
            diagonal = above

    return row[width]
| #87 | |
| #88 | |
def similarity(s1: str, s2: str) -> float:
    """
    Entity-aware similarity score: 1.0 = identical, 0.0 = completely different.

    Both inputs are lowercased and stripped before comparison. The score is
    chosen by the first rule that applies:

    1. Equal after normalization                -> 1.0
    2. Exactly one side empty                   -> 0.0
    3. One is a prefix of the other             -> 0.7 + 0.3 * (shorter / longer)
       (e.g., "Abdias" vs "Abdias J" = 0.925)
    4. One is a substring of the other          -> 0.5 + 0.3 * (shorter / longer)
       (e.g., "Mr. Smith" contains "Smith")
    5. Otherwise                                -> 1 - levenshtein / longer
    """
    left = s1.lower().strip()
    right = s2.lower().strip()

    if left == right:
        return 1.0

    # The empty string is a prefix (and substring) of every string, so without
    # this guard "" vs "Abdias" would earn the 0.7 prefix bonus. An empty name
    # shares nothing with a non-empty one, which is also what the edit-distance
    # formula below yields (distance == longer length).
    if not left or not right:
        return 0.0

    shorter, longer = sorted((len(left), len(right)))
    length_ratio = shorter / longer

    # Prefix match bonus: 'Abdias' vs 'Abdias J'
    if left.startswith(right) or right.startswith(left):
        return 0.7 + length_ratio * 0.3  # 0.7 base + scaled bonus

    # Substring match: 'Mr. Smith' contains 'Smith'
    if left in right or right in left:
        return 0.5 + length_ratio * 0.3

    # General case: normalized edit distance.
    return 1.0 - (levenshtein_distance(left, right) / longer)
| #120 | |
| #121 | |
def extract_entities_regex(text: str) -> List[str]:
    """
    Extract entity candidates from text using regex patterns.

    Every pattern in _ENTITY_PATTERNS is applied to the text and capture
    group 1 of each match is taken as a candidate. A candidate is discarded
    when it:

    - is shorter than 2 characters after stripping whitespace,
    - is a single word found in the stop-word list,
    - consists only of digits (ignoring '.' and ','),
    - is a single lowercase word that did not come from an @mention or
      #hashtag.

    Afterwards, candidates that occur as a whole word (or whole-word
    sequence) inside a longer candidate are dropped, so "New" and "York" give
    way to "New York". Containment is checked on word boundaries, so "Ann" is
    NOT dropped merely because "Annabel Lee" was also found. Candidates seen
    as an @mention or #hashtag are never dropped and never cause another
    candidate to be dropped.

    Returns a sorted list of unique entity strings. Returns [] for empty or
    non-string input.
    """
    if not text or not isinstance(text, str):
        return []

    entities: Set[str] = set()
    # Candidates seen at least once as an @mention or #hashtag. The captured
    # text does not include the '@'/'#' prefix, so the origin has to be
    # remembered separately for the pruning step below.
    tagged: Set[str] = set()

    for pattern in _ENTITY_PATTERNS:
        for match in pattern.finditer(text):
            entity = match.group(1).strip()
            # Filter: must be at least 2 chars
            if len(entity) < 2:
                continue

            is_single_word = len(entity.split()) == 1
            # Filter out stop words (single word only)
            if is_single_word and entity.lower() in _STOP_WORDS:
                continue
            # Filter out pure numbers
            if entity.replace('.', '').replace(',', '').isdigit():
                continue

            # A match whose full text begins with '@' or '#' came from a
            # mention/hashtag pattern.
            is_tag = match.group(0)[0] in ('@', '#')
            # Standalone lowercase words are only accepted as mentions or
            # hashtags, which are lowercase by nature.
            if is_single_word and entity[0].islower() and not is_tag:
                continue

            entities.add(entity)
            if is_tag:
                tagged.add(entity)

    def _is_tag_like(name: str) -> bool:
        # Mention/hashtag origin, or a quoted phrase that itself starts with
        # '@' or '#'.
        return name in tagged or name.startswith(('@', '#'))

    # Drop candidates that are fully contained, on word boundaries, in a
    # longer non-tag candidate; keep only the longest match.
    kept: List[str] = []
    for entity in entities:
        if not _is_tag_like(entity):
            whole_word = re.compile(r'(?<!\w)' + re.escape(entity) + r'(?!\w)')
            if any(
                other != entity
                and not _is_tag_like(other)
                and whole_word.search(other)
                for other in entities
            ):
                continue
        kept.append(entity)

    return sorted(kept)
| #187 | |
| #188 | |
def find_similar_entities(entity: str, known_entities: List[str], threshold: float = 0.8) -> List[Tuple[str, float]]:
    """
    Rank known entities by how closely they resemble the given entity.

    A known entity identical to `entity` always qualifies with a score of 1.0;
    any other qualifies when its similarity() score is at least `threshold`.

    Returns list of (entity_name, similarity_score) tuples, best score first.
    Entries with equal scores keep their order from `known_entities`.
    """
    def _score(candidate: str) -> float:
        # Exact matches bypass the similarity computation entirely.
        return 1.0 if candidate == entity else similarity(entity, candidate)

    scored = [(candidate, _score(candidate)) for candidate in known_entities]
    hits = [
        pair for pair in scored
        if pair[0] == entity or pair[1] >= threshold
    ]
    return sorted(hits, key=lambda pair: pair[1], reverse=True)
| #206 | |
| #207 | |
def entity_extraction_performance(text: str, iterations: int = 1000) -> float:
    """
    Measure entity extraction performance.

    Runs extract_entities_regex(text) `iterations` times and returns the
    average wall-clock time per extraction in milliseconds.

    Returns 0.0 when `iterations` is zero or negative: nothing is run, so
    there is no average to report (and no division by zero).
    """
    import time

    # Guard against dividing by zero (iterations == 0) and against the
    # meaningless negative average a negative count would produce.
    if iterations <= 0:
        return 0.0

    start = time.perf_counter()
    for _ in range(iterations):
        extract_entities_regex(text)
    elapsed = time.perf_counter() - start
    return (elapsed / iterations) * 1000
| #219 |