Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Migrate Notion source to Connector V2 structure #162

Open
wants to merge 48 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
d618716
Mantain the current v1 file
Oct 9, 2024
5171d9b
finished
Oct 9, 2024
7b93cb6
Black format and changelog add
Oct 10, 2024
2946f52
Fix Makefile issue with lint
Oct 10, 2024
3fc17e8
more fixes
Oct 10, 2024
5d83d7c
restore main files
Oct 10, 2024
40bc3ff
More restored files from main
Oct 10, 2024
1682181
removed useless variable
Oct 10, 2024
cff69aa
Optimized imports
Oct 10, 2024
98e6398
few ruff fixes
Oct 10, 2024
7269c66
last ruff fix
Oct 10, 2024
8b844e0
version updated
Oct 10, 2024
698d715
added notion-client to base.in
Oct 11, 2024
16885f3
remove unused error
Oct 11, 2024
70ee2a5
changed reference
Oct 11, 2024
4bcde56
ruff fix
Oct 11, 2024
4be07b0
Fixed and saving files now
Oct 15, 2024
314a068
Merge branch 'main' into DS-87-notion-source-migration
Oct 15, 2024
9a4147e
addressed
Oct 15, 2024
7c5a5d9
Roman Access Config request addressed
Oct 16, 2024
4ffe9c6
Library type_check done
Oct 16, 2024
1fc45a4
black fix
Oct 16, 2024
f2bc82d
merge main solve conlicts
Oct 16, 2024
04f518f
version file matching
Oct 16, 2024
d43985e
More libraries that needed to be capsulated
Oct 16, 2024
d767980
merge main
Oct 17, 2024
622c05b
Remove leftover comment
Oct 17, 2024
3cbaa50
Multiple PR changes assigned
Oct 18, 2024
2e4f45b
merge main
Oct 18, 2024
7820f09
fixes
Oct 18, 2024
c0c7efe
tries
Oct 18, 2024
f58db9e
More Client
Oct 18, 2024
d5d3339
most done
Oct 18, 2024
0e000b1
missed this
Oct 18, 2024
3cc4086
trying
Oct 18, 2024
f82c0b0
black
Oct 18, 2024
9f02b74
version change
Oct 18, 2024
e4d8118
async client
Oct 18, 2024
2579ace
connector.py updates
Oct 21, 2024
2d4a1d7
autopep8 updates
Oct 21, 2024
b3802b2
merge main
Oct 21, 2024
502aa1a
Roman comments addressed
Oct 22, 2024
abd9f1a
version bump
Oct 22, 2024
d2263ea
params issue
Oct 22, 2024
450aff6
stop ignoring Notion
Oct 23, 2024
3672f3a
merge conflict
Oct 23, 2024
4b1e612
my bad, versions dont match
Oct 23, 2024
047dabf
Async Indexes, making it work
Oct 23, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
## 0.1.2-dev5

### Enhancements

* **Migrate Notion Source Connector to V2**

## 0.1.1-dev4

### Enhancements
Expand Down
3 changes: 1 addition & 2 deletions test_e2e/src/notion.sh
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
--num-processes "$max_processes" \
--recursive \
--verbose \
--work-dir "$WORK_DIR" \
--max-retry-time 30
--work-dir "$WORK_DIR"

"$SCRIPT_DIR"/check-diff-expected-output.py --output-folder-name $OUTPUT_FOLDER_NAME
1 change: 0 additions & 1 deletion test_e2e/test-dest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ trap print_last_run EXIT
python_version=$(python --version 2>&1)

tests_to_ignore=(
'notion.sh'
'dropbox.sh'
'sharepoint.sh'
)
Expand Down
4 changes: 1 addition & 3 deletions test_e2e/test-src.sh
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,7 @@ trap print_last_run EXIT

python_version=$(python --version 2>&1)

tests_to_ignore=(
'notion.sh'
)
tests_to_ignore=()

for test in "${all_tests[@]}"; do
CURRENT_TEST="$test"
Expand Down
2 changes: 1 addition & 1 deletion unstructured_ingest/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.1.1-dev4" # pragma: no cover
__version__ = "0.1.2-dev5" # pragma: no cover
3 changes: 3 additions & 0 deletions unstructured_ingest/v2/processes/connectors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@
from .milvus import milvus_destination_entry
from .mongodb import CONNECTOR_TYPE as MONGODB_CONNECTOR_TYPE
from .mongodb import mongodb_destination_entry, mongodb_source_entry
from .notion.connector import CONNECTOR_TYPE as NOTION_CONNECTOR_TYPE
from .notion.connector import notion_source_entry
from .onedrive import CONNECTOR_TYPE as ONEDRIVE_CONNECTOR_TYPE
from .onedrive import onedrive_source_entry
from .opensearch import CONNECTOR_TYPE as OPENSEARCH_CONNECTOR_TYPE
Expand Down Expand Up @@ -99,6 +101,7 @@

add_destination_entry(destination_type=KDBAI_CONNECTOR_TYPE, entry=kdbai_destination_entry)
add_source_entry(source_type=AIRTABLE_CONNECTOR_TYPE, entry=airtable_source_entry)
add_source_entry(source_type=NOTION_CONNECTOR_TYPE, entry=notion_source_entry)

add_source_entry(source_type=OUTLOOK_CONNECTOR_TYPE, entry=outlook_source_entry)

Expand Down
Empty file.
108 changes: 108 additions & 0 deletions unstructured_ingest/v2/processes/connectors/notion/client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
from typing import Any, Generator

import httpx
from notion_client import Client as NotionClient
from notion_client.api_endpoints import BlocksChildrenEndpoint as NotionBlocksChildrenEndpoint
from notion_client.api_endpoints import DatabasesEndpoint as NotionDatabasesEndpoint
from notion_client.errors import HTTPResponseError, RequestTimeoutError

from unstructured_ingest.v2.processes.connectors.notion.types.block import Block
from unstructured_ingest.v2.processes.connectors.notion.types.database import Database
from unstructured_ingest.v2.processes.connectors.notion.types.database_properties import map_cells
from unstructured_ingest.v2.processes.connectors.notion.types.page import Page


class AsyncBlocksChildrenEndpoint(NotionBlocksChildrenEndpoint):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._http_client = httpx.AsyncClient()

async def list(self, block_id: str, **kwargs: Any) -> tuple[list[Block], dict]:
"""Fetch the list of child blocks asynchronously."""
try:
response = await self._http_client.get(
f"{self.parent._api_base}/blocks/{block_id}/children", **kwargs
)
response.raise_for_status()
except httpx.HTTPStatusError as e:
raise HTTPResponseError(f"Failed to list blocks: {str(e)}")
except httpx.TimeoutException:
raise RequestTimeoutError()

resp = response.json()
child_blocks = [Block.from_dict(data=b) for b in resp.pop("results", [])]
return child_blocks, resp

async def iterate_list(
self, block_id: str, **kwargs: Any
) -> Generator[list[Block], None, None]:
"""Fetch the list of child blocks in pages asynchronously."""
next_cursor = None
while True:
params = {"start_cursor": next_cursor} if next_cursor else {}
params.update(kwargs)
child_blocks, response = await self.list(block_id, **params)
yield child_blocks

next_cursor = response.get("next_cursor")
if not response.get("has_more") or not next_cursor:
return

async def close(self):
"""Close the HTTP client."""
await self._http_client.aclose()


class AsyncDatabasesEndpoint(NotionDatabasesEndpoint):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._http_client = httpx.AsyncClient()

async def retrieve(self, database_id: str, **kwargs: Any) -> Database:
"""Fetch a database by its ID asynchronously."""
try:
response = await self._http_client.get(
f"{self.parent._api_base}/databases/{database_id}", **kwargs
)
response.raise_for_status()
except httpx.HTTPStatusError as e:
raise HTTPResponseError(f"Failed to retrieve database: {str(e)}")
except httpx.TimeoutException:
raise RequestTimeoutError()

return Database.from_dict(data=response.json())

async def query(self, database_id: str, **kwargs: Any) -> tuple[list[Page], dict]:
"""Query a database asynchronously."""
try:
response = await self._http_client.post(
f"{self.parent._api_base}/databases/{database_id}/query",
json=kwargs.get("json", {}),
)
response.raise_for_status()
except httpx.HTTPStatusError as e:
raise HTTPResponseError(f"Failed to query database: {str(e)}")
except httpx.TimeoutException:
raise RequestTimeoutError()

resp = response.json()
pages = [Page.from_dict(data=p) for p in resp.pop("results", [])]
for p in pages:
p.properties = map_cells(p.properties)
return pages, resp

async def close(self):
"""Close the HTTP client."""
await self._http_client.aclose()


class AsyncClient(NotionClient):
def __init__(self, *args: Any, **kwargs: Any) -> None:
super().__init__(*args, **kwargs)
self.blocks = AsyncBlocksChildrenEndpoint(parent=self)
self.databases = AsyncDatabasesEndpoint(parent=self)

async def close(self):
"""Close all async endpoints."""
await self.blocks.close()
await self.databases.close()
Loading
Loading