from datetime import datetime
from typing import Dict

import numpy as np
import pandas as pd


def create_document_chunks_df(
    documents: Dict[str, str],
    start_date: datetime,
    end_date: datetime,
    embedding_size: int = 60,
) -> pd.DataFrame:
    """Build a DataFrame of document chunks with random embeddings.

    Each document is split into chunks on ``". "``; every chunk is repeated
    for each hourly timestamp in ``[start_date, end_date)`` (UTC) plus one
    fixed timestamp (2021-04-12 07:00 UTC).

    Example df generated by this function:

    | event_timestamp  | document_id | chunk_id | chunk_text     | embedding  | created          |
    |------------------+-------------+----------+----------------+------------+------------------|
    | 2021-03-17 19:31 | doc_1       | chunk-1  | Hello world    | [0.1, ...] | 2021-03-24 19:34 |
    | 2021-03-17 19:31 | doc_1       | chunk-2  | How are you?   | [0.2, ...] | 2021-03-24 19:34 |
    | 2021-03-17 19:31 | doc_2       | chunk-1  | This is a test | [0.3, ...] | 2021-03-24 19:34 |
    | 2021-03-17 19:31 | doc_2       | chunk-2  | Document chunk | [0.4, ...] | 2021-03-24 19:34 |

    Args:
        documents: Mapping of document id to the document's full text.
        start_date: Start of the hourly timestamp range (inclusive).
        end_date: End of the hourly timestamp range (exclusive).
        embedding_size: Length of the random embedding list generated per row.

    Returns:
        A DataFrame with the columns ``event_timestamp``, ``document_id``,
        ``chunk_id``, ``chunk_text``, ``embedding`` and ``created``. Chunks
        appear in reverse generation order (the last chunk of the last
        document comes first), and the middle row is appended twice at the
        end as duplicates. If ``documents`` is empty, an empty DataFrame with
        these columns is returned.
    """
    columns = [
        "event_timestamp",
        "document_id",
        "chunk_id",
        "chunk_text",
        "embedding",
        "created",
    ]

    # Hourly UTC timestamps over [start_date, end_date), millisecond-rounded,
    # followed by one fixed extra timestamp.
    event_timestamps = [
        pd.Timestamp(dt, unit="ms").round("ms")
        for dt in pd.date_range(
            start=start_date,
            end=end_date,
            freq="1h",
            inclusive="left",
            tz="UTC",
        )
    ]
    event_timestamps.append(
        pd.Timestamp(
            year=2021, month=4, day=12, hour=7, minute=0, second=0, tz="UTC"
        )
    )
    df_hourly = pd.DataFrame({"event_timestamp": event_timestamps})

    # One copy of the hourly frame per (document, chunk) pair.
    chunk_frames = []
    for doc_id, doc_text in documents.items():
        chunks = doc_text.split(". ")  # Simple chunking by sentence
        for chunk_id, chunk_text in enumerate(chunks, start=1):
            chunk_frames.append(
                df_hourly.assign(
                    document_id=doc_id,
                    chunk_id=f"chunk-{chunk_id}",
                    chunk_text=chunk_text,
                )
            )

    if not chunk_frames:
        # No documents: nothing to embed or duplicate.
        return pd.DataFrame(columns=columns)

    # Concatenate once instead of growing a frame inside the loop. The list
    # is reversed so the most recently generated chunk comes first.
    df_all_chunks = pd.concat(chunk_frames[::-1], ignore_index=True)
    rows = len(df_all_chunks)

    # Generate random embeddings for each chunk
    df_all_chunks["embedding"] = [
        np.random.rand(embedding_size).tolist() for _ in range(rows)
    ]
    # Timezone-naive current time, rounded to milliseconds.
    df_all_chunks["created"] = pd.to_datetime(pd.Timestamp.now(tz=None).round("ms"))

    # Create duplicate rows that should be filtered by created timestamp
    middle = rows // 2
    late_row = df_all_chunks.iloc[middle : middle + 1]
    df_all_chunks = pd.concat([df_all_chunks, late_row, late_row], ignore_index=True)

    return df_all_chunks