repositories
loading repo index
repositories
loading repo index
repository
loading code, commits, and activity
public Clawd ADK gateway launch mirror
stars
latest
clone command
git clone gitlawb://did:key:z6Mkq5mY...iFZ5/my-project-publ...
git clone gitlawb://did:key:z6Mkq5mY.../my-project-publ...
2fa351d6 docs: add automaton and perps launch sources 16d ago
| #1 | import hashlib |
| #2 | from unittest.mock import MagicMock |
| #3 | |
| #4 | import pytest |
| #5 | |
| #6 | from embedchain.chunkers.base_chunker import BaseChunker |
| #7 | from embedchain.config.add_config import ChunkerConfig |
| #8 | from embedchain.models.data_type import DataType |
| #9 | |
| #10 | |
@pytest.fixture
def text_splitter_mock():
    """Provide a fresh ``MagicMock`` that stands in for the text splitter."""
    splitter = MagicMock()
    return splitter
| #14 | |
| #15 | |
@pytest.fixture
def loader_mock():
    """Provide a fresh ``MagicMock`` that stands in for the data loader."""
    loader = MagicMock()
    return loader
| #19 | |
| #20 | |
@pytest.fixture
def app_id():
    """Provide the application id string used by the chunker tests."""
    identifier = "test_app"
    return identifier
| #24 | |
| #25 | |
@pytest.fixture
def data_type():
    """Provide the ``DataType`` member (``TEXT``) the chunker is configured with."""
    selected = DataType.TEXT
    return selected
| #29 | |
| #30 | |
@pytest.fixture
def chunker(text_splitter_mock, data_type):
    """Build a ``BaseChunker`` around the mocked splitter and set its data type."""
    instance = BaseChunker(text_splitter_mock)
    instance.set_data_type(data_type)
    return instance
| #37 | |
| #38 | |
def test_create_chunks_with_config(chunker, text_splitter_mock, loader_mock, app_id, data_type):
    """Check ``create_chunks`` output when an explicit ``ChunkerConfig`` is passed.

    The splitter mock yields "Chunk 1" and "long chunk"; with
    ``min_chunk_size=10`` only "long chunk" is expected in the documents.
    NOTE(review): presumably "Chunk 1" is dropped for being shorter than
    ``min_chunk_size`` -- confirm against ``BaseChunker.create_chunks``.
    """
    # Stub the loader with a single document carrying a URL in its metadata.
    loader_mock.load_data.return_value = {
        "data": [{"content": "Content 1", "meta_data": {"url": "URL 1"}}],
        "doc_id": "DocID",
    }
    text_splitter_mock.split_text.return_value = ["Chunk 1", "long chunk"]

    config = ChunkerConfig(chunk_size=50, chunk_overlap=0, length_function=len, min_chunk_size=10)
    result = chunker.create_chunks(loader_mock, "test_src", app_id, config)

    expected_documents = ["long chunk"]
    assert result["documents"] == expected_documents
| #49 | |
| #50 | |
def test_create_chunks(chunker, text_splitter_mock, loader_mock, app_id, data_type):
    """Check documents, ids, metadatas and doc_id returned by ``create_chunks``.

    Expected ids are ``"<app_id>--"`` followed by the SHA-256 hex digest of the
    chunk text concatenated with the source URL; every metadata entry carries
    the URL, the data type value and the app-prefixed doc id.
    """
    expected_chunks = ("Chunk 1", "Chunk 2")
    # Hand the splitter its own list so the expectation cannot alias it.
    text_splitter_mock.split_text.return_value = list(expected_chunks)
    loader_mock.load_data.return_value = {
        "data": [{"content": "Content 1", "meta_data": {"url": "URL 1"}}],
        "doc_id": "DocID",
    }

    result = chunker.create_chunks(loader_mock, "test_src", app_id)

    expected_ids = [
        f"{app_id}--" + hashlib.sha256((chunk + "URL 1").encode()).hexdigest()
        for chunk in expected_chunks
    ]
    expected_doc_id = f"{app_id}--DocID"
    expected_metadata = {
        "url": "URL 1",
        "data_type": data_type.value,
        "doc_id": expected_doc_id,
    }

    assert result["documents"] == list(expected_chunks)
    assert result["ids"] == expected_ids
    # One identical metadata entry is expected per chunk.
    assert result["metadatas"] == [dict(expected_metadata) for _ in expected_chunks]
    assert result["doc_id"] == expected_doc_id
| #79 | |
| #80 | |
def test_get_chunks(chunker, text_splitter_mock):
    """Check that ``get_chunks`` returns the two chunks supplied by the splitter mock."""
    expected = ["Chunk 1", "Chunk 2"]
    # Give the mock a separate list so the comparison is against an untouched copy.
    text_splitter_mock.split_text.return_value = list(expected)

    result = chunker.get_chunks("This is a test content.")

    assert len(result) == 2
    assert result == expected
| #89 | |
| #90 | |
def test_set_data_type(chunker):
    """Check that ``set_data_type`` stores the given value on ``chunker.data_type``."""
    new_type = DataType.MDX
    chunker.set_data_type(new_type)
    assert chunker.data_type == DataType.MDX
| #94 | |
| #95 | |
def test_get_word_count(chunker):
    """Check that ``get_word_count`` returns 6 for the two sample sentences."""
    # "This is a test." has 4 words and "Another test." has 2.
    total = chunker.get_word_count(["This is a test.", "Another test."])
    assert total == 6
| #100 |