my-project-public

repository

loading code, commits, and activity

repositories

loading repo index

#1	"""
#2	Unit tests for Mnemosyne Entity Sketching System.
#3
#4	Tests:
#5	- Levenshtein distance and similarity
#6	- Regex entity extraction
#7	- Fuzzy entity matching
#8	- Triple storage for entities
#9	"""
#10
#11	import sys
#12	import os
#13	import unittest
#14
#15	# Add mnemosyne to path
#16	sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
#17
#18	from mnemosyne.core.entities import (
#19	levenshtein_distance,
#20	similarity,
#21	extract_entities_regex,
#22	find_similar_entities,
#23	ENTITY_EXTRACTION_STOP_WORDS,
#24	)
#25
#26
class TestLevenshtein(unittest.TestCase):
    """Exercise ``levenshtein_distance`` on basic single-edit and edge inputs."""

    def test_exact_match(self):
        """A string compared with itself (including the empty string) is at distance 0."""
        for text in ("hello", ""):
            self.assertEqual(levenshtein_distance(text, text), 0)

    def test_single_insertion(self):
        """Appending one character costs exactly one edit."""
        distance = levenshtein_distance("cat", "cats")
        self.assertEqual(distance, 1)

    def test_single_deletion(self):
        """Dropping one character costs exactly one edit."""
        distance = levenshtein_distance("cats", "cat")
        self.assertEqual(distance, 1)

    def test_single_substitution(self):
        """Replacing one character costs exactly one edit."""
        distance = levenshtein_distance("cat", "cut")
        self.assertEqual(distance, 1)

    def test_empty_string(self):
        """Distance to or from the empty string equals the other string's length."""
        for left, right in (("", "abc"), ("abc", "")):
            self.assertEqual(levenshtein_distance(left, right), 3)

    def test_unicode(self):
        """Non-ASCII characters are each expected to count as a single edit."""
        unicode_pairs = (
            ("café", "cafe"),
            ("日本", "日本語"),
        )
        for left, right in unicode_pairs:
            self.assertEqual(levenshtein_distance(left, right), 1)
#50
#51
class TestSimilarity(unittest.TestCase):
    """Exercise the prefix-biased ``similarity`` scoring function."""

    def test_exact_match(self):
        """Identical names score exactly 1.0."""
        score = similarity("Abdias", "Abdias")
        self.assertEqual(score, 1.0)

    def test_similar_names(self):
        """A name and an extended form of it score above the stated floors."""
        # Abdias vs Abdias J. — should be high similarity
        with_initial = similarity("Abdias", "Abdias J.")
        self.assertGreater(with_initial, 0.8)
        # With prefix boost, should be even higher
        with_surname = similarity("Abdias", "Abdias Moya")
        self.assertGreater(with_surname, 0.7)

    def test_different_names(self):
        """Names sharing only a short prefix land between 0.3 and 0.8."""
        # Abdias vs Abdul — should be lower
        checked_against_ceiling = similarity("Abdias", "Abdul")
        self.assertLess(checked_against_ceiling, 0.8)
        # Some prefix overlap keeps the score off the floor
        checked_against_floor = similarity("Abdias", "Abdul")
        self.assertGreater(checked_against_floor, 0.3)

    def test_completely_different(self):
        """Unrelated names score below 0.3."""
        score = similarity("Abdias", "Zebra")
        self.assertLess(score, 0.3)

    def test_case_insensitive(self):
        """Letter case does not affect the score."""
        score = similarity("ABDIAS", "abdias")
        self.assertEqual(score, 1.0)

    def test_short_strings(self):
        """Single-character inputs score 1.0 when equal and 0.0 when different."""
        for left, right, expected in (("A", "A", 1.0), ("A", "B", 0.0)):
            self.assertEqual(similarity(left, right), expected)

    def test_partial_prefix(self):
        """A bare prefix still matches the full name reasonably well."""
        # "Abd" should match "Abdias" reasonably
        score = similarity("Abd", "Abdias")
        self.assertGreater(score, 0.5)
#82
#83
class TestRegexEntityExtraction(unittest.TestCase):
    """Test regex-based entity extraction via ``extract_entities_regex``.

    Every method name in this class is unique. A second ``def`` with the same
    name would silently replace the first in the class body, so the earlier
    test would never run.
    """

    def test_simple_name(self):
        """A single capitalized name inside a sentence is extracted."""
        result = extract_entities_regex("I met Abdias yesterday.")
        self.assertIn("Abdias", result)

    def test_multiple_names(self):
        """Several names, including a two-word place name, are all extracted."""
        result = extract_entities_regex("Abdias and Maya went to New York.")
        self.assertIn("Abdias", result)
        self.assertIn("Maya", result)
        self.assertIn("New York", result)

    def test_quoted_phrase(self):
        """A double-quoted phrase is extracted without its quotes."""
        result = extract_entities_regex('She said "Hello World" to everyone.')
        self.assertIn("Hello World", result)

    def test_no_entities(self):
        """All-lowercase text yields no entities."""
        result = extract_entities_regex("the quick brown fox jumps")
        # All lowercase, no entities expected
        self.assertEqual(len(result), 0)

    def test_at_mention(self):
        """An @mention yields the handle without the leading ``@``."""
        result = extract_entities_regex("Contact @abdias for help.")
        # @mentions capture the word after @, not the @ itself
        self.assertIn("abdias", result)

    def test_hashtag(self):
        """A hashtag yields the tag text without the leading ``#``."""
        result = extract_entities_regex("This is #ImportantNews today.")
        # Hashtags capture the word after #, not the # itself
        self.assertIn("ImportantNews", result)

    def test_stop_words_filtered(self):
        """A lone stop word is dropped, but a phrase starting with one is kept."""
        result = extract_entities_regex("The Quick Brown Fox")
        # "The" should be filtered as a stop word, but the full phrase
        # "The Quick Brown Fox" is kept (capitalized sequence)
        self.assertNotIn("The", result)
        self.assertIn("The Quick Brown Fox", result)

    def test_mixed_content(self):
        """Names, hashtags, mentions and places are all found in one text."""
        result = extract_entities_regex(
            "Abdias said: 'The Mnemosyne project is #Awesome. "
            "Contact @support or visit New York.'"
        )
        self.assertIn("Abdias", result)
        # "The Mnemosyne" is extracted as a capitalized sequence
        self.assertIn("The Mnemosyne", result)
        self.assertIn("Awesome", result)  # from #Awesome
        self.assertIn("support", result)  # from @support
        self.assertIn("New York", result)
#148
#149
class TestFindSimilarEntities(unittest.TestCase):
    """Test fuzzy entity matching against known entities.

    ``find_similar_entities`` is expected to return a list of
    ``(known_entity, score)`` tuples, as pinned by ``test_exact_match``.
    """

    def test_exact_match(self):
        """An exact query returns only that entity, with score 1.0."""
        known = ["Abdias", "Maya", "Mnemosyne"]
        result = find_similar_entities("Abdias", known, threshold=0.8)
        self.assertEqual(result, [("Abdias", 1.0)])

    def test_fuzzy_match(self):
        """A query with a trailing initial still matches the base name at ~0.9."""
        known = ["Abdias", "Maya", "Mnemosyne"]
        result = find_similar_entities("Abdias J.", known, threshold=0.8)
        scores = dict(result)
        self.assertIn("Abdias", scores)
        # Compare with a tolerance: the score comes out of floating-point
        # arithmetic, so an exact literal would break on harmless rounding changes.
        self.assertAlmostEqual(scores["Abdias"], 0.9, places=7)

    def test_no_match_below_threshold(self):
        """An unrelated query returns nothing at a high threshold."""
        known = ["Abdias", "Maya"]
        result = find_similar_entities("Zebra", known, threshold=0.8)
        self.assertEqual(len(result), 0)

    def test_multiple_matches(self):
        """A base name matches at least one of its extended variants."""
        known = ["Abdias Moya", "Abdias J.", "Maya"]
        result = find_similar_entities("Abdias", known, threshold=0.7)
        # Both Abdias variants are expected to clear the threshold; this
        # assertion only requires that at least one of them does.
        self.assertGreaterEqual(len(result), 1)

    def test_case_insensitive_match(self):
        """Query case is ignored and the known entity keeps its own casing."""
        known = ["Abdias"]
        result = find_similar_entities("ABDIAS", known, threshold=0.8)
        self.assertEqual(result, [("Abdias", 1.0)])
#178
#179
class TestStopWords(unittest.TestCase):
    """Check the contents of ``ENTITY_EXTRACTION_STOP_WORDS``."""

    def test_common_stop_words_present(self):
        """Common English function words are in the stop-word collection."""
        for word in ("the", "and", "for"):
            self.assertIn(word, ENTITY_EXTRACTION_STOP_WORDS)

    def test_case_insensitive(self):
        """A capitalized word is found once lowercased by the caller."""
        # Stop words are lowercase
        lowered = "The".lower()
        self.assertIn(lowered, ENTITY_EXTRACTION_STOP_WORDS)
#191
#192
class TestEdgeCases(unittest.TestCase):
    """Cover degenerate inputs: empty, whitespace-only and ``None``."""

    def test_empty_string_extraction(self):
        """Extracting from an empty string yields no entities."""
        entities = extract_entities_regex("")
        self.assertEqual(len(entities), 0)

    def test_whitespace_only(self):
        """Extracting from whitespace-only text yields no entities."""
        entities = extract_entities_regex(" \n\t ")
        self.assertEqual(len(entities), 0)

    def test_similarity_with_empty(self):
        """Pin the scores for empty-vs-empty and empty-vs-non-empty inputs."""
        # Empty vs non-empty: prefix match path — shorter is empty (len 0),
        # longer is "abc" (len 3), so 0.7 + (0/3)*0.3 = 0.7
        expectations = (
            ("", "", 1.0),
            ("abc", "", 0.7),
            ("", "abc", 0.7),
        )
        for left, right, expected in expectations:
            self.assertEqual(similarity(left, right), expected)

    def test_levenshtein_with_none(self):
        """Passing ``None`` must raise rather than return a distance."""
        # Should handle None gracefully or raise TypeError
        acceptable_errors = (TypeError, AttributeError)
        with self.assertRaises(acceptable_errors):
            levenshtein_distance(None, "abc")
#215
#216
if __name__ == "__main__":
    # Allow running this file directly as a script, not only through a test runner.
    unittest.main()
#219

z6Mkq5mY3JWtxoxUobWcfNHm7AkRubgSWEZTkBVqZXJviFZ5/my-project-public