feast/sdk/python/feast/nlp_test_data.py at java-update · feast-dev/feast

History

67 lines (58 loc) · 2.5 KB

Raw

from datetime import datetime

from typing import Dict

import numpy as np

import pandas as pd

def create_document_chunks_df(
    documents: Dict[str, str],
    start_date: datetime,
    end_date: datetime,
    embedding_size: int = 60,
) -> pd.DataFrame:
    """
    Build a DataFrame of document chunks with random embeddings for tests.

    Each document text is split on ". " into chunks, and every chunk is
    repeated once per event timestamp. The event timestamps are hourly
    between ``start_date`` (inclusive) and ``end_date`` (exclusive) in UTC,
    plus one fixed timestamp at 2021-04-12 07:00 UTC.

    Example df generated by this function:

    | event_timestamp  | document_id | chunk_id | chunk_text       | embedding | created          |
    |------------------+-------------+----------+------------------+-----------+------------------|
    | 2021-03-17 19:31 | doc_1       | chunk-1  | Hello world      | [0.1, ...]| 2021-03-24 19:34 |
    | 2021-03-17 19:31 | doc_1       | chunk-2  | How are you?     | [0.2, ...]| 2021-03-24 19:34 |
    | 2021-03-17 19:31 | doc_2       | chunk-1  | This is a test   | [0.3, ...]| 2021-03-24 19:34 |
    | 2021-03-17 19:31 | doc_2       | chunk-2  | Document chunk   | [0.4, ...]| 2021-03-24 19:34 |

    Args:
        documents: Mapping of document id to the full document text.
        start_date: First hourly event timestamp (inclusive).
        end_date: End of the hourly range (exclusive).
        embedding_size: Length of the random embedding list in each row.

    Returns:
        A DataFrame with the columns ``event_timestamp``, ``document_id``,
        ``chunk_id``, ``chunk_text``, ``embedding`` and ``created``. The rows
        of the most recently processed chunk come first, and one row from the
        middle of the frame is appended twice at the end as exact duplicates.
        If ``documents`` is empty, an empty DataFrame with these columns is
        returned.
    """
    columns = [
        "event_timestamp",
        "document_id",
        "chunk_id",
        "chunk_text",
        "embedding",
        "created",
    ]

    hourly_timestamps = [
        pd.Timestamp(dt, unit="ms").round("ms")
        for dt in pd.date_range(
            start=start_date,
            end=end_date,
            freq="1h",
            inclusive="left",
            tz="UTC",
        )
    ]
    # One extra, fixed timestamp that is independent of the requested range.
    fixed_timestamp = pd.Timestamp(
        year=2021, month=4, day=12, hour=7, minute=0, second=0, tz="UTC"
    )
    df_hourly = pd.DataFrame(
        {"event_timestamp": hourly_timestamps + [fixed_timestamp]}
    )

    # Collect one frame per chunk and concatenate once at the end, instead of
    # re-copying the accumulated frame on every iteration.
    chunk_frames = []
    for doc_id, doc_text in documents.items():
        chunks = doc_text.split(". ")  # Simple chunking by sentence
        for chunk_id, chunk_text in enumerate(chunks, start=1):
            chunk_frames.append(
                df_hourly.assign(
                    document_id=doc_id,
                    chunk_id=f"chunk-{chunk_id}",
                    chunk_text=chunk_text,
                )
            )

    if not chunk_frames:
        # No documents: return an empty frame with the expected schema rather
        # than failing on a missing "event_timestamp" column.
        return pd.DataFrame(columns=columns)

    # Reversed so that the latest chunk comes first, matching the row order
    # historically produced by prepending each new frame.
    df_all_chunks = pd.concat(chunk_frames[::-1], ignore_index=True)
    rows = len(df_all_chunks)

    # Generate random embeddings for each chunk
    df_all_chunks["embedding"] = [
        np.random.rand(embedding_size).tolist() for _ in range(rows)
    ]
    # Naive (timezone-unaware) creation time, rounded to milliseconds.
    df_all_chunks["created"] = pd.to_datetime(pd.Timestamp.now(tz=None).round("ms"))

    # Create duplicate rows that should be filtered by created timestamp
    # NOTE(review): the duplicates carry the same "created" value as the
    # source row — confirm that downstream de-duplication expects this.
    middle = rows // 2
    late_row = df_all_chunks.iloc[middle : middle + 1]
    df_all_chunks = pd.concat([df_all_chunks, late_row, late_row], ignore_index=True)
    return df_all_chunks

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

nlp_test_data.py

Latest commit

History

nlp_test_data.py

File metadata and controls