-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_process.py
More file actions
42 lines (39 loc) · 959 Bytes
/
Copy pathdata_process.py
File metadata and controls
42 lines (39 loc) · 959 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
try:
import xml.etree.cElementTree as ET
except ImportError:
import xml.etree.ElementTree as ET
import data_helpers
def write_to_file(file, line):
    """Write ``line`` to ``file`` followed by a single newline.

    Args:
        file: Any object exposing a ``write`` method (e.g. an open text file).
        line: The text to write; it is concatenated with ``"\\n"``.
    """
    terminated = line + "\n"
    file.write(terminated)
def cat_map():
    """Build a category-name -> integer-id mapping from the "cat" file.

    Reads the file named "cat" in the current working directory, strips
    surrounding whitespace from every line and de-duplicates the results.
    Ids start at 1 and are handed out in sorted order of the names, so the
    same file contents always yield the same mapping (plain set iteration
    order is not stable across interpreter runs).

    Returns:
        dict: each distinct stripped line mapped to its 1-based id.

    Raises:
        IOError / OSError: if "cat" cannot be opened for reading.
    """
    # The context manager closes the handle even if reading fails.
    with open("cat") as f:
        names = {raw.strip() for raw in f}
    return {name: index for index, name in enumerate(sorted(names), start=1)}
# Convert the entries of test.xml into three line-aligned text files:
#   "cnn"  - cleaned <subject> text
#   "lstm" - cleaned <content> text (the subject is used when <content> is absent)
#   "cat"  - cleaned <maincat> text
# Line N of each file comes from the same <document>. All three files are
# opened in append mode, so re-running the script adds to existing output.
tree = ET.ElementTree(file="test.xml")
root = tree.getroot()
# The with-statement guarantees the outputs are closed even when parsing or
# cleaning raises part-way through the input.
with open("cnn", "a") as cnn, open("lstm", "a") as lstm, open("cat", "a") as cat:
    for vespaadd in root:
        document = vespaadd.find("document")
        # Identity tests are required here: an Element's truth value depends
        # on whether it has children, not on whether it was found.
        if document is None:
            continue
        subject = document.find("subject")
        content = document.find("content")
        maincat = document.find("maincat")
        # An entry without a subject or a main category is skipped.
        if subject is None or maincat is None:
            continue
        if content is None:
            content = subject
        # NOTE(review): .text is None for an empty element; whether
        # data_helpers.clean_str accepts None is not visible here - confirm.
        # Clean every field before writing any of them, so a failure cannot
        # leave the three files with a different number of lines.
        subject_line = data_helpers.clean_str(subject.text)
        content_line = data_helpers.clean_str(content.text)
        category_line = data_helpers.clean_str(maincat.text)
        write_to_file(cnn, subject_line)
        write_to_file(lstm, content_line)
        write_to_file(cat, category_line)