repositories
loading repo index
repositories
loading repo index
repository
loading code, commits, and activity
public Clawd ADK gateway launch mirror
stars
latest
clone command
git clone gitlawb://did:key:z6Mkq5mY...iFZ5/my-project-publ...
git clone gitlawb://did:key:z6Mkq5mY.../my-project-publ...
2fa351d6 docs: add automaton and perps launch sources 16d ago
| #1 | import hashlib |
| #2 | from unittest.mock import Mock, patch |
| #3 | |
| #4 | import pytest |
| #5 | import requests |
| #6 | |
| #7 | from embedchain.loaders.web_page import WebPageLoader |
| #8 | |
| #9 | |
@pytest.fixture
def web_page_loader():
    """Provide a newly constructed ``WebPageLoader`` to each test that requests it."""
    loader = WebPageLoader()
    return loader
| #13 | |
| #14 | |
def test_load_data(web_page_loader):
    """Check the ``doc_id`` and ``data`` returned by ``load_data`` for one page.

    The session's ``get`` is patched to return a canned 200 response, so no
    network access happens. The expected values are derived by running the
    loader's own ``_get_clean_content`` on the same HTML.
    """
    page_url = "https://example.com/page"
    html = """
        <html>
            <head>
                <title>Test Page</title>
            </head>
            <body>
                <div id="content">
                    <p>This is some test content.</p>
                </div>
            </body>
        </html>
    """
    fake_response = Mock()
    fake_response.status_code = 200
    fake_response.content = html

    # Swap the HTTP GET for the canned response while load_data runs.
    get_target = "embedchain.loaders.web_page.WebPageLoader._session.get"
    with patch(get_target, return_value=fake_response):
        result = web_page_loader.load_data(page_url)

    # The doc id is expected to be the SHA-256 of cleaned content + URL.
    cleaned = web_page_loader._get_clean_content(fake_response.content, page_url)
    digest = hashlib.sha256((cleaned + page_url).encode()).hexdigest()
    assert result["doc_id"] == digest

    assert result["data"] == [
        {
            "content": cleaned,
            "meta_data": {
                "url": page_url,
            },
        }
    ]
| #48 | |
| #49 | |
def test_get_clean_content_excludes_unnecessary_info(web_page_loader):
    """Check that ``_get_clean_content`` output carries no excluded markup names.

    A sample page containing boilerplate tags, ids and classes is cleaned, and
    the result is checked to (a) contain none of the excluded tag / id / class
    names as substrings and (b) be non-empty.
    """
    mock_html = """
        <html>
            <head>
                <title>Sample HTML</title>
                <style>
                    /* Stylesheet to be excluded */
                    .elementor-location-header {
                        background-color: #f0f0f0;
                    }
                </style>
            </head>
            <body>
                <header id="header">Header Content</header>
                <nav class="nav">Nav Content</nav>
                <aside>Aside Content</aside>
                <form>Form Content</form>
                <main>Main Content</main>
                <footer class="footer">Footer Content</footer>
                <script>Some Script</script>
                <noscript>NoScript Content</noscript>
                <svg>SVG Content</svg>
                <canvas>Canvas Content</canvas>

                <div id="sidebar">Sidebar Content</div>
                <div id="main-navigation">Main Navigation Content</div>
                <div id="menu-main-menu">Menu Main Menu Content</div>

                <div class="header-sidebar-wrapper">Header Sidebar Wrapper Content</div>
                <div class="blog-sidebar-wrapper">Blog Sidebar Wrapper Content</div>
                <div class="related-posts">Related Posts Content</div>
            </body>
        </html>
    """

    tags_to_exclude = [
        "nav",
        "aside",
        "form",
        "header",
        "noscript",
        "svg",
        "canvas",
        "footer",
        "script",
        "style",
    ]
    ids_to_exclude = ["sidebar", "main-navigation", "menu-main-menu"]
    classes_to_exclude = [
        "elementor-location-header",
        "navbar-header",
        "nav",
        "header-sidebar-wrapper",
        "blog-sidebar-wrapper",
        "related-posts",
    ]

    content = web_page_loader._get_clean_content(mock_html, "https://example.com/page")

    # NOTE(review): these are case-sensitive substring checks on the *names*
    # of the excluded tags/ids/classes, while the sample text inside those
    # elements is capitalised (e.g. "Header Content"). Unless the cleaning step
    # changes case, leaked element text would not trip them -- consider also
    # asserting on the element text itself once the loader's behaviour is
    # confirmed.
    for tag in tags_to_exclude:
        assert tag not in content, f"tag name {tag!r} found in cleaned content"

    # Loop variable is deliberately not called ``id`` so the builtin stays usable.
    for element_id in ids_to_exclude:
        assert element_id not in content, f"id {element_id!r} found in cleaned content"

    for class_name in classes_to_exclude:
        assert class_name not in content, f"class {class_name!r} found in cleaned content"

    # Cleaning must not throw everything away.
    assert len(content) > 0
| #119 | |
| #120 | |
def test_fetch_reference_links_success(web_page_loader):
    """A 200 response is expected to yield only the absolute http(s) anchor links.

    The page holds two absolute links and one relative link; only the two
    absolute ones are expected back, in document order.
    """
    html = b"""
        <html>
            <body>
                <a href="http://example.com">Example</a>
                <a href="https://another-example.com">Another Example</a>
                <a href="/relative-link">Relative Link</a>
            </body>
        </html>
    """
    # Stand-in for a successful HTTP response carrying the page above.
    ok_response = Mock(spec=requests.Response)
    ok_response.status_code = 200
    ok_response.content = html

    links = web_page_loader.fetch_reference_links(ok_response)

    assert links == ["http://example.com", "https://another-example.com"]
| #138 | |
| #139 | |
def test_fetch_reference_links_failure(web_page_loader):
    """A 404 response with an empty body is expected to yield no reference links."""
    # Stand-in for a failed HTTP response.
    not_found = Mock(spec=requests.Response)
    not_found.status_code = 404
    not_found.content = b""

    links = web_page_loader.fetch_reference_links(not_found)

    assert links == []
| #149 |