my-project-public

repository

loading code, commits, and activity

repositories

loading repo index

#1	import hashlib
#2	from unittest.mock import MagicMock
#3
#4	import pytest
#5
#6	from embedchain.chunkers.base_chunker import BaseChunker
#7	from embedchain.config.add_config import ChunkerConfig
#8	from embedchain.models.data_type import DataType
#9
#10
@pytest.fixture
def text_splitter_mock():
    """Provide a fresh ``MagicMock`` standing in for the text splitter."""
    splitter = MagicMock()
    return splitter
#14
#15
@pytest.fixture
def loader_mock():
    """Provide a fresh ``MagicMock`` standing in for the data loader."""
    loader = MagicMock()
    return loader
#19
#20
@pytest.fixture
def app_id():
    """Provide the application identifier shared by the tests."""
    identifier = "test_app"
    return identifier
#24
#25
@pytest.fixture
def data_type():
    """Provide the data type assigned to the chunker under test."""
    selected_type = DataType.TEXT
    return selected_type
#29
#30
@pytest.fixture
def chunker(text_splitter_mock, data_type):
    """Build a ``BaseChunker`` around the mocked splitter.

    The chunker's data type is set to the ``data_type`` fixture value
    before it is handed to the test.
    """
    instance = BaseChunker(text_splitter_mock)
    instance.set_data_type(data_type)
    return instance
#37
#38
def test_create_chunks_with_config(chunker, text_splitter_mock, loader_mock, app_id, data_type):
    """Check ``create_chunks`` when an explicit ``ChunkerConfig`` is passed.

    The config sets ``min_chunk_size=10`` with ``len`` as the length
    function. The splitter yields "Chunk 1" (7 characters) and
    "long chunk" (10 characters); only the latter is expected in the
    resulting documents.
    """
    loader_mock.load_data.return_value = {
        "data": [{"content": "Content 1", "meta_data": {"url": "URL 1"}}],
        "doc_id": "DocID",
    }
    text_splitter_mock.split_text.return_value = ["Chunk 1", "long chunk"]
    chunker_config = ChunkerConfig(
        chunk_size=50,
        chunk_overlap=0,
        length_function=len,
        min_chunk_size=10,
    )

    chunks = chunker.create_chunks(loader_mock, "test_src", app_id, chunker_config)

    assert chunks["documents"] == ["long chunk"]
#49
#50
def test_create_chunks(chunker, text_splitter_mock, loader_mock, app_id, data_type):
    """Check the full ``create_chunks`` result without a config.

    Expected shape of the result, as asserted below:
    - ``documents``: the chunks returned by the splitter, in order.
    - ``ids``: ``"<app_id>--"`` followed by the SHA-256 hex digest of the
      chunk text concatenated with the item's URL.
    - ``metadatas``: one entry per chunk carrying the URL, the data type
      value and the app-prefixed document id.
    - ``doc_id``: the loader's doc id prefixed with ``"<app_id>--"``.
    """
    loader_mock.load_data.return_value = {
        "data": [{"content": "Content 1", "meta_data": {"url": "URL 1"}}],
        "doc_id": "DocID",
    }
    text_splitter_mock.split_text.return_value = ["Chunk 1", "Chunk 2"]

    outcome = chunker.create_chunks(loader_mock, "test_src", app_id)

    prefixed_doc_id = f"{app_id}--DocID"
    id_list = [
        f"{app_id}--" + hashlib.sha256((text + "URL 1").encode()).hexdigest()
        for text in ("Chunk 1", "Chunk 2")
    ]
    # Both chunks come from the same loaded item, so their metadata is identical.
    metadata_list = [
        {
            "url": "URL 1",
            "data_type": data_type.value,
            "doc_id": prefixed_doc_id,
        }
        for _ in range(2)
    ]

    assert outcome["documents"] == ["Chunk 1", "Chunk 2"]
    assert outcome["ids"] == id_list
    assert outcome["metadatas"] == metadata_list
    assert outcome["doc_id"] == prefixed_doc_id
#79
#80
def test_get_chunks(chunker, text_splitter_mock):
    """Check that ``get_chunks`` returns the splitter's chunks for the content."""
    text_splitter_mock.split_text.return_value = ["Chunk 1", "Chunk 2"]

    pieces = chunker.get_chunks("This is a test content.")

    assert len(pieces) == 2
    assert pieces == ["Chunk 1", "Chunk 2"]
#89
#90
def test_set_data_type(chunker):
    """Check that ``set_data_type`` stores the given type on ``data_type``."""
    new_type = DataType.MDX

    chunker.set_data_type(new_type)

    assert chunker.data_type == new_type
#94
#95
def test_get_word_count(chunker):
    """Check that ``get_word_count`` totals the words across all documents."""
    # "This is a test." has 4 words and "Another test." has 2, giving 6.
    word_total = chunker.get_word_count(["This is a test.", "Another test."])

    assert word_total == 6
#100

z6Mkq5mY3JWtxoxUobWcfNHm7AkRubgSWEZTkBVqZXJviFZ5/my-project-public