-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Expand file tree
/
Copy pathnlp_test_data.py
More file actions
67 lines (58 loc) · 2.5 KB
/
nlp_test_data.py
File metadata and controls
67 lines (58 loc) · 2.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
from datetime import datetime
from typing import Dict
import numpy as np
import pandas as pd
def create_document_chunks_df(
    documents: Dict[str, str],
    start_date: datetime,
    end_date: datetime,
    embedding_size: int = 60,
) -> pd.DataFrame:
    """
    Build a DataFrame of document chunks with random embeddings.

    Every document is split into chunks on ". ", and every chunk is repeated
    once per hourly timestamp in ``[start_date, end_date)`` (UTC) plus one
    fixed timestamp (2021-04-12 07:00 UTC). Chunks appear in reverse
    processing order: the last chunk of the last document comes first.

    Example df generated by this function:
    | event_timestamp  | document_id | chunk_id | chunk_text       | embedding | created          |
    |------------------+-------------+----------+------------------+-----------+------------------|
    | 2021-03-17 19:31 | doc_1       | chunk-1  | Hello world      | [0.1, ...]| 2021-03-24 19:34 |
    | 2021-03-17 19:31 | doc_1       | chunk-2  | How are you?     | [0.2, ...]| 2021-03-24 19:34 |
    | 2021-03-17 19:31 | doc_2       | chunk-1  | This is a test   | [0.3, ...]| 2021-03-24 19:34 |
    | 2021-03-17 19:31 | doc_2       | chunk-2  | Document chunk   | [0.4, ...]| 2021-03-24 19:34 |

    Args:
        documents: Mapping of document id to the document's full text.
        start_date: Inclusive start of the hourly timestamp range.
        end_date: Exclusive end of the hourly timestamp range.
        embedding_size: Length of the random embedding list stored per row.

    Returns:
        A DataFrame with columns ``event_timestamp``, ``document_id``,
        ``chunk_id``, ``chunk_text``, ``embedding`` and ``created``. The row
        at position ``rows // 2`` is appended twice at the end as an exact
        duplicate. If ``documents`` is empty, an empty DataFrame with those
        columns is returned.
    """
    columns = [
        "event_timestamp",
        "document_id",
        "chunk_id",
        "chunk_text",
        "embedding",
        "created",
    ]

    hourly_timestamps = [
        pd.Timestamp(dt, unit="ms").round("ms")
        for dt in pd.date_range(
            start=start_date,
            end=end_date,
            freq="1h",
            inclusive="left",
            tz="UTC",
        )
    ]
    # One extra fixed timestamp is always appended after the hourly range.
    fixed_timestamp = pd.Timestamp(
        year=2021, month=4, day=12, hour=7, minute=0, second=0, tz="UTC"
    )
    df_hourly = pd.DataFrame(
        {"event_timestamp": hourly_timestamps + [fixed_timestamp]}
    )

    # Collect one frame per chunk and concatenate once at the end, instead of
    # re-concatenating a growing frame on every iteration.
    chunk_frames = []
    for doc_id, doc_text in documents.items():
        chunks = doc_text.split(". ")  # Simple chunking by sentence
        for chunk_id, chunk_text in enumerate(chunks, start=1):
            chunk_frames.append(
                df_hourly.assign(
                    document_id=doc_id,
                    chunk_id=f"chunk-{chunk_id}",
                    chunk_text=chunk_text,
                )
            )

    if not chunk_frames:
        # No documents: return an empty frame rather than failing on a
        # missing "event_timestamp" column further down.
        return pd.DataFrame(columns=columns)

    # Newest frame first keeps the row order callers already get, and with it
    # the identity of the duplicated "late" row below.
    df_all_chunks = pd.concat(chunk_frames[::-1], ignore_index=True)
    rows = len(df_all_chunks)

    # Generate random embeddings for each chunk
    df_all_chunks["embedding"] = [
        np.random.rand(embedding_size).tolist() for _ in range(rows)
    ]
    # Single naive (tz=None) timestamp shared by every row.
    df_all_chunks["created"] = pd.Timestamp.now(tz=None).round("ms")

    # Create duplicate rows that should be filtered by created timestamp
    middle = rows // 2
    late_row = df_all_chunks.iloc[middle : middle + 1]
    df_all_chunks = pd.concat([df_all_chunks, late_row, late_row], ignore_index=True)
    return df_all_chunks