repositories
loading repo index
repositories
loading repo index
repository
loading code, commits, and activity
public Clawd ADK gateway launch mirror
stars
latest
clone command
git clone gitlawb://did:key:z6Mkq5mY...iFZ5/my-project-publ...
git clone gitlawb://did:key:z6Mkq5mY.../my-project-publ...
2fa351d6 docs: add automaton and perps launch sources
16d ago
| #1 | # ruff: noqa: E501 |
| #2 | |
| #3 | from embedchain.chunkers.text import TextChunker |
| #4 | from embedchain.config import ChunkerConfig |
| #5 | from embedchain.models.data_type import DataType |
| #6 | |
| #7 | |
class TestTextChunker:
    """Tests for ``TextChunker``: chunk counts at several chunk sizes, and word counting."""

    @staticmethod
    def _build_chunker(chunk_size):
        """Return a ``(chunker, config)`` pair for the given chunk size.

        Every test in this class uses the same settings apart from
        ``chunk_size``: no overlap, ``len`` as the length function and a
        minimum chunk size of zero.
        """
        chunker_config = ChunkerConfig(
            chunk_size=chunk_size, chunk_overlap=0, length_function=len, min_chunk_size=0
        )
        chunker = TextChunker(config=chunker_config)
        # Data type must be set manually in the test
        chunker.set_data_type(DataType.TEXT)
        return chunker, chunker_config

    @classmethod
    def _chunk_documents(cls, chunk_size, text):
        """Chunk ``text`` through ``MockLoader`` and return the ``"documents"`` entry of the result."""
        chunker, chunker_config = cls._build_chunker(chunk_size)
        result = chunker.create_chunks(MockLoader(), text, chunker_config)
        return result["documents"]

    def test_chunks_without_app_id(self):
        """
        Test the chunks generated by TextChunker.
        """
        text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
        documents = self._chunk_documents(10, text)
        assert len(documents) > 5

    def test_chunks_with_app_id(self):
        """
        Test the chunks generated by TextChunker with app_id
        """
        # NOTE(review): no app_id is actually passed anywhere in this test, so it
        # currently exercises exactly the same path as test_chunks_without_app_id.
        # Confirm whether an app_id argument was meant to be supplied.
        text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
        documents = self._chunk_documents(10, text)
        assert len(documents) > 5

    def test_big_chunksize(self):
        """
        Test that if an infinitely high chunk size is used, only one chunk is returned.
        """
        text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
        documents = self._chunk_documents(9999999999, text)
        assert len(documents) == 1

    def test_small_chunksize(self):
        """
        Test that if a chunk size of one is used, every character is a chunk.
        """
        # We can't test with lorem ipsum because chunks are deduped, so would be recurring characters.
        text = """0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c"""
        documents = self._chunk_documents(1, text)
        assert len(documents) == len(text)

    def test_word_count(self):
        """
        Test that get_word_count sums the words across all given documents.
        """
        chunker, _ = self._build_chunker(1)

        # Two documents of two whitespace-separated words each.
        document = ["ab cd", "ef gh"]
        result = chunker.get_word_count(document)
        assert result == 4
| #69 | |
| #70 | |
class MockLoader:
    """Minimal stand-in loader that echoes its source back as loaded content."""

    @staticmethod
    def load_data(src) -> dict:
        """
        Mock loader that wraps ``src`` in a loader-style result dictionary.

        The returned dictionary has a fixed ``"doc_id"`` of ``"123"`` and a
        ``"data"`` list with a single entry whose ``"content"`` is ``src``
        unchanged and whose ``"meta_data"`` is ``{"url": "none"}``.
        Adjust this method to return different data for testing.
        """
        return {
            "doc_id": "123",
            "data": [
                {
                    "content": src,
                    "meta_data": {"url": "none"},
                }
            ],
        }
| #87 |