-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpython_bindings.cpp
More file actions
314 lines (295 loc) · 13.9 KB
/
python_bindings.cpp
File metadata and controls
314 lines (295 loc) · 13.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
/**
* Python bindings for docx_comment_parser using pybind11.
*
* Build command (example):
* c++ -O2 -std=c++17 -fPIC -shared \
* -I../include $(python3-config --includes) \
* -I$(python3 -c "import pybind11; print(pybind11.get_include())") \
* python_bindings.cpp ../src/docx_parser.cpp ../src/batch_parser.cpp \
* ../src/zip_reader.cpp \
* -lz \
* -o docx_comment_parser$(python3-config --extension-suffix)
*
* Usage from Python:
* import docx_comment_parser as dcp
* parser = dcp.DocxParser()
* parser.parse("my_file.docx")
* for c in parser.comments():
* print(c.author, c.text)
*/
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <pybind11/operators.h>
#include "docx_comment_parser.h"
namespace py = pybind11;
using namespace docx;
PYBIND11_MODULE(docx_comment_parser, m) {
m.doc() = R"doc(
docx_comment_parser
===================
Fast C++ library for extracting all comment metadata from .docx files.
Classes
-------
CommentRef – lightweight reference to a related comment
CommentMetadata – all data extracted for one comment
DocumentCommentStats– document-level aggregate statistics
DocxParser – single-file parser
BatchParser – multi-file parallel parser
)doc";
// ── CommentRef ────────────────────────────────────────────────────────────
py::class_<CommentRef>(m, "CommentRef",
"Lightweight reference to a related (replied-to) comment.")
.def_readonly("id", &CommentRef::id,
"Comment id (w:id attribute).")
.def_readonly("author", &CommentRef::author,
"Author name of the referenced comment.")
.def_readonly("date", &CommentRef::date,
"ISO-8601 date string of the referenced comment.")
.def_readonly("text_snippet", &CommentRef::text_snippet,
"First 120 characters of the referenced comment's text.")
.def("__repr__", [](const CommentRef& r){
return "<CommentRef id=" + std::to_string(r.id)
+ " author='" + r.author + "'>";
});
// ── CommentMetadata ───────────────────────────────────────────────────────
py::class_<CommentMetadata>(m, "CommentMetadata", R"doc(
All metadata extracted for a single comment (w:comment element).
Attributes
----------
id : int – w:id
author : str – w:author
date : str – ISO-8601 date (as stored in XML)
initials : str – w:initials
text : str – full plain-text of comment body
paragraph_style : str – style of first paragraph inside comment
range_start_para_id : str – paraId of commentRangeStart (OOXML 2016+)
range_end_para_id : str – paraId of commentRangeEnd (OOXML 2016+)
referenced_text : str – document text anchored by this comment
is_reply : bool – True if this comment is a reply
parent_id : int – id of parent comment (-1 if root)
replies : list[CommentRef] – direct replies (on parent)
para_id : str – unique per-comment paragraph id
para_id_parent : str – parent paragraph id string
done : bool – resolved/done flag (OOXML 2016+)
paragraph_index : int – 0-based paragraph in document body
run_index : int – 0-based run within paragraph
thread_ids : list[int] – ordered ids of entire thread (root only)
)doc")
.def_readonly("id", &CommentMetadata::id)
.def_readonly("author", &CommentMetadata::author)
.def_readonly("date", &CommentMetadata::date)
.def_readonly("initials", &CommentMetadata::initials)
.def_readonly("text", &CommentMetadata::text)
.def_readonly("paragraph_style", &CommentMetadata::paragraph_style)
.def_readonly("range_start_para_id", &CommentMetadata::range_start_para_id)
.def_readonly("range_end_para_id", &CommentMetadata::range_end_para_id)
.def_readonly("referenced_text", &CommentMetadata::referenced_text)
.def_readonly("is_reply", &CommentMetadata::is_reply)
.def_readonly("parent_id", &CommentMetadata::parent_id)
.def_readonly("replies", &CommentMetadata::replies)
.def_readonly("para_id", &CommentMetadata::para_id)
.def_readonly("para_id_parent", &CommentMetadata::para_id_parent)
.def_readonly("done", &CommentMetadata::done)
.def_readonly("paragraph_index", &CommentMetadata::paragraph_index)
.def_readonly("run_index", &CommentMetadata::run_index)
.def_readonly("thread_ids", &CommentMetadata::thread_ids)
.def("to_dict", [](const CommentMetadata& m) {
py::dict d;
d["id"] = m.id;
d["author"] = m.author;
d["date"] = m.date;
d["initials"] = m.initials;
d["text"] = m.text;
d["paragraph_style"] = m.paragraph_style;
d["range_start_para_id"] = m.range_start_para_id;
d["range_end_para_id"] = m.range_end_para_id;
d["referenced_text"] = m.referenced_text;
d["is_reply"] = m.is_reply;
d["parent_id"] = m.parent_id;
d["para_id"] = m.para_id;
d["para_id_parent"] = m.para_id_parent;
d["done"] = m.done;
d["paragraph_index"] = m.paragraph_index;
d["run_index"] = m.run_index;
d["thread_ids"] = m.thread_ids;
py::list replies;
for (const auto& r : m.replies) {
py::dict rd;
rd["id"] = r.id;
rd["author"] = r.author;
rd["date"] = r.date;
rd["text_snippet"] = r.text_snippet;
replies.append(rd);
}
d["replies"] = replies;
return d;
}, "Return all metadata as a Python dict.")
.def("__repr__", [](const CommentMetadata& m){
return "<CommentMetadata id=" + std::to_string(m.id)
+ " author='" + m.author + "'"
+ (m.is_reply ? " [reply]" : "") + ">";
});
// ── DocumentCommentStats ─────────────────────────────────────────────────
py::class_<DocumentCommentStats>(m, "DocumentCommentStats", R"doc(
Document-level comment statistics.
Attributes
----------
file_path : str
total_comments : int
total_resolved : int – comments with done=True
total_replies : int
total_root_comments : int
unique_authors : list[str]
earliest_date : str – ISO-8601
latest_date : str – ISO-8601
)doc")
.def_readonly("file_path", &DocumentCommentStats::file_path)
.def_readonly("total_comments", &DocumentCommentStats::total_comments)
.def_readonly("total_resolved", &DocumentCommentStats::total_resolved)
.def_readonly("total_replies", &DocumentCommentStats::total_replies)
.def_readonly("total_root_comments",&DocumentCommentStats::total_root_comments)
.def_readonly("unique_authors", &DocumentCommentStats::unique_authors)
.def_readonly("earliest_date", &DocumentCommentStats::earliest_date)
.def_readonly("latest_date", &DocumentCommentStats::latest_date)
.def("to_dict", [](const DocumentCommentStats& s){
py::dict d;
d["file_path"] = s.file_path;
d["total_comments"] = s.total_comments;
d["total_resolved"] = s.total_resolved;
d["total_replies"] = s.total_replies;
d["total_root_comments"] = s.total_root_comments;
d["unique_authors"] = s.unique_authors;
d["earliest_date"] = s.earliest_date;
d["latest_date"] = s.latest_date;
return d;
}, "Return stats as a Python dict.")
.def("__repr__", [](const DocumentCommentStats& s){
return "<DocumentCommentStats total=" + std::to_string(s.total_comments)
+ " file='" + s.file_path + "'>";
});
// ── DocxParser ────────────────────────────────────────────────────────────
py::class_<DocxParser>(m, "DocxParser", R"doc(
Single-file .docx comment parser.
Example
-------
>>> import docx_comment_parser as dcp
>>> p = dcp.DocxParser()
>>> p.parse("report.docx")
>>> for c in p.comments():
... print(c.id, c.author, c.text[:60])
)doc")
.def(py::init<>())
.def("parse",
&DocxParser::parse,
py::arg("file_path"),
R"doc(
Parse a .docx file and extract all comment metadata.
Parameters
----------
file_path : str
Absolute or relative path to the .docx file.
Raises
------
DocxFileError if the file cannot be opened.
DocxFormatError if required OOXML parts are missing or malformed.
)doc")
.def("comments",
&DocxParser::comments,
py::return_value_policy::reference_internal,
"Return list of all CommentMetadata objects (sorted by id).")
.def("stats",
&DocxParser::stats,
py::return_value_policy::reference_internal,
"Return DocumentCommentStats for the parsed file.")
.def("find_by_id",
[](const DocxParser& self, int id) -> py::object {
const CommentMetadata* m = self.find_by_id(id);
if (!m) return py::none();
return py::cast(*m);
},
py::arg("id"),
"Return CommentMetadata for the given id, or None if not found.")
.def("by_author",
[](const DocxParser& self, const std::string& author){
auto ptrs = self.by_author(author);
py::list result;
for (auto* p : ptrs) result.append(*p);
return result;
},
py::arg("author"),
"Return list of CommentMetadata authored by the given person.")
.def("root_comments",
[](const DocxParser& self){
auto ptrs = self.root_comments();
py::list result;
for (auto* p : ptrs) result.append(*p);
return result;
},
"Return non-reply root comments in document order.")
.def("thread",
[](const DocxParser& self, int root_id){
auto ptrs = self.thread(root_id);
py::list result;
for (auto* p : ptrs) result.append(*p);
return result;
},
py::arg("root_id"),
"Return ordered list of CommentMetadata forming the thread for root_id.");
// ── BatchParser ───────────────────────────────────────────────────────────
py::class_<BatchParser>(m, "BatchParser", R"doc(
Multi-file parallel .docx comment parser.
Example
-------
>>> import docx_comment_parser as dcp, glob
>>> bp = dcp.BatchParser(max_threads=4)
>>> bp.parse_all(glob.glob("/docs/*.docx"))
>>> for path in glob.glob("/docs/*.docx"):
... print(path, bp.stats(path).total_comments)
>>> bp.release_all()
)doc")
.def(py::init<unsigned int>(),
py::arg("max_threads") = 0u,
"Create a BatchParser. max_threads=0 uses all CPU cores.")
.def("parse_all",
&BatchParser::parse_all,
py::arg("file_paths"),
py::call_guard<py::gil_scoped_release>(),
R"doc(
Parse a list of .docx files in parallel.
Files that fail are recorded in errors() rather than raising.
Parameters
----------
file_paths : list[str]
)doc")
.def("comments",
[](const BatchParser& self, const std::string& fp) {
return self.comments(fp);
},
py::arg("file_path"),
"Return list of CommentMetadata for a previously parsed file.")
.def("stats",
[](const BatchParser& self, const std::string& fp){
return self.stats(fp);
},
py::arg("file_path"),
"Return DocumentCommentStats for a previously parsed file.")
.def("errors",
[](const BatchParser& self){
py::dict d;
for (const auto& kv : self.errors())
d[py::str(kv.first)] = kv.second;
return d;
},
"Return dict of {file_path: error_message} for files that failed.")
.def("release",
&BatchParser::release,
py::arg("file_path"),
"Free memory for a specific parsed file.")
.def("release_all",
&BatchParser::release_all,
"Free memory for all parsed files.");
// ── Exception types ───────────────────────────────────────────────────────
py::register_exception<DocxFileError> (m, "DocxFileError", PyExc_IOError);
py::register_exception<DocxFormatError>(m, "DocxFormatError", PyExc_ValueError);
py::register_exception<DocxParserError>(m, "DocxParserError", PyExc_RuntimeError);
}