-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Expand file tree
/
Copy pathdocument_labeling.py
More file actions
96 lines (76 loc) · 2.94 KB
/
document_labeling.py
File metadata and controls
96 lines (76 loc) · 2.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import json
from typing import Any, Dict, List, Optional
from feast.feature import Feature
class DocumentLabel:
    """A label attached to one chunk of one document.

    Attributes are plain instance fields: ``chunk_id``, ``document_id``,
    ``label``, an optional ``confidence`` and a ``metadata`` dict.
    """

    def __init__(
        self,
        chunk_id: str,
        document_id: str,
        label: str,
        confidence: Optional[float] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ):
        """Store the given values; a falsy ``metadata`` becomes a new empty dict."""
        self.chunk_id = chunk_id
        self.document_id = document_id
        self.label = label
        self.confidence = confidence
        self.metadata = metadata if metadata else {}

    def to_dict(self) -> Dict[str, Any]:
        """Return the label's fields as a dict keyed by attribute name."""
        field_names = ("chunk_id", "document_id", "label", "confidence", "metadata")
        return {name: getattr(self, name) for name in field_names}

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "DocumentLabel":
        """Build an instance from a dict shaped like the output of ``to_dict``.

        ``chunk_id``, ``document_id`` and ``label`` are required (a missing
        key raises ``KeyError``); ``confidence`` and ``metadata`` are optional.
        """
        kwargs = {key: data[key] for key in ("chunk_id", "document_id", "label")}
        kwargs["confidence"] = data.get("confidence")
        kwargs["metadata"] = data.get("metadata", {})
        return cls(**kwargs)
def store_document_label(feature: Feature, label: DocumentLabel) -> None:
    """Append ``label`` to the JSON list kept under ``"document_labels"``.

    The labels mapping is taken from ``feature.labels`` when that attribute
    exists and is not ``None``; otherwise from ``feature._labels``, which is
    initialised to an empty dict only when it is ``None``. Existing entries
    in either mapping are preserved.

    Args:
        feature: Object exposing a ``labels`` and/or ``_labels`` mapping.
        label: Label to store; serialised via ``label.to_dict()``.

    Returns:
        None. If the feature exposes neither usable attribute, the call is
        a silent no-op.
    """
    labels_dict = getattr(feature, "labels", None)
    if labels_dict is None:
        if not hasattr(feature, "_labels"):
            # Nowhere to store the label; keep the best-effort no-op.
            return
        if feature._labels is None:
            feature._labels = {}
        # Reuse the existing private mapping instead of resetting it, so
        # labels stored earlier are not discarded.
        labels_dict = feature._labels

    labels_key = "document_labels"
    # The value is a JSON-encoded list; a missing key means no labels yet.
    existing_labels = json.loads(labels_dict.get(labels_key, "[]"))
    existing_labels.append(label.to_dict())
    labels_dict[labels_key] = json.dumps(existing_labels)
def get_document_labels(feature: Feature) -> List[DocumentLabel]:
    """Return the document labels stored on ``feature``.

    Reads the JSON list under ``"document_labels"`` from ``feature.labels``
    when that mapping is non-empty, otherwise from ``feature._labels``.

    Args:
        feature: Object that may expose a ``labels`` and/or ``_labels`` mapping.

    Returns:
        One ``DocumentLabel`` per stored entry, or an empty list when no
        non-empty mapping exists or it has no ``"document_labels"`` key.
    """
    # The first truthy mapping wins; a missing attribute counts as absent.
    store = getattr(feature, "labels", None) or getattr(feature, "_labels", None)
    if not store or "document_labels" not in store:
        return []

    entries = json.loads(store["document_labels"])
    return [DocumentLabel.from_dict(entry) for entry in entries]
def remove_document_label(feature: Feature, chunk_id: str, document_id: str) -> bool:
    """Delete every stored label matching both ``chunk_id`` and ``document_id``.

    The labels mapping is ``feature.labels`` when it is non-empty, otherwise
    ``feature._labels``. The JSON list under ``"document_labels"`` is rewritten
    only when at least one entry was removed.

    Args:
        feature: Object that may expose a ``labels`` and/or ``_labels`` mapping.
        chunk_id: Chunk identifier of the label(s) to remove.
        document_id: Document identifier of the label(s) to remove.

    Returns:
        True if at least one label was removed, False otherwise.
    """
    store = getattr(feature, "labels", None) or getattr(feature, "_labels", None)
    if not store or "document_labels" not in store:
        return False

    entries = json.loads(store["document_labels"])
    # Keep an entry unless both identifiers match.
    kept = [
        entry
        for entry in entries
        if entry["chunk_id"] != chunk_id or entry["document_id"] != document_id
    ]
    if len(kept) == len(entries):
        return False

    store["document_labels"] = json.dumps(kept)
    return True