# MarxWordEnvironment.py
# Generate word environment csv files: for a target word, score the words
# that appear near it in a corpus and write the scores to a CSV.
import sys
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import enchant
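# NB: assumes the NLTK 'stopwords' and 'punkt' corpora have been downloaded
# (nltk.download('stopwords'); nltk.download('punkt')) and that pyenchant
# can find a German ('de_DE') dictionary on the system.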
# when True, capture the concordance to temp.txt and write the CSV output
do_write_out = True
def marx_do_word_environment(corpus, word_to_find,
                             out_directory='./data/word_environment/'):
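    """Build a word-environment CSV for word_to_find in a corpus file.

    Scores the words that co-occur with (the stemmed form of) the target
    word in each concordance line and writes one CSV row per hit, with a
    column for each of the top-scoring environment words.
    """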
    # german stopwords
    stop_words = set(stopwords.words('german'))
    stop_words.update(['dass', 'sein', 'sei', 'war', 'den', 'ein', 'wurde',
                       'wurden', 'auf', 'trotzdem', 'ausser', 'daher', 'dah'])
    # redirect all printed text to temp.txt; this is because concordance
    # prints to stdout, and we want to capture that output in a file
    if do_write_out:
        sys.stdout = open('temp.txt', 'w', encoding="utf8")
    with open(corpus, 'r', encoding="utf8") as Kapital:
        raw = Kapital.read()
    # fixes words divided by hyphens at the end of the line
    raw = raw.replace('-\n', '')
    raw = raw.replace('\n', ' ')
    # remove punctuation, tokenize text
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(raw)
    # get rid of stopwords (compare lowercased, so capitalized stopwords
    # at sentence starts are caught too)
    without_stops = []
    for w in tokens:
        if w.lower() not in stop_words:
            without_stops.append(w.lower())
    # create NLTK text from the tokens in order to perform all the
    # linguistic processing that NLTK allows us to do
    text = nltk.Text(without_stops)
    stemmer = nltk.stem.snowball.GermanStemmer()
    stemmed_target = stemmer.stem(word_to_find).lower()
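    # the stemmer collapses inflected forms to a common stem, so e.g.
    # singular/plural variants of the target all count as one word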
    # stem all occurrences of our specific word
    word_count = len(text)
    stemmeds = 0
    new_text = []
    for x in range(word_count):
        word = text[x].lower()
        s = stemmer.stem(word)
        if s == stemmed_target:
            new_text.append(s)
            stemmeds += 1
        else:
            new_text.append(word)
    new_text = nltk.Text(new_text)
    # this gets written inside of temp.txt; search new_text so that the
    # stemmed occurrences of the target are actually matched
    new_text.concordance(stemmed_target, width=400, lines=stemmeds)
    # reset stdout back to normal (only if it was redirected)
    if do_write_out:
        sys.stdout.close()
        sys.stdout = sys.__stdout__
    german_dict = enchant.Dict('de_DE')
    # now read in the concordance
    with open('temp.txt', 'r', encoding="utf8") as concordance_file:
        raw_concordance_lines = concordance_file.readlines()
    conc_lines = [line.strip() for line in raw_concordance_lines]
    # remove the first line of the file; it's the metadata line that
    # concordance prints ("Displaying N of M matches")
    conc_lines = conc_lines[1:]
    # loop1: clean up, make a word inventory, generate freqdists
    word_inventory = set()
    mcs = []
    for line in conc_lines:
        conc_tokens = nltk.word_tokenize(line, language='german')
        without_stops = []
        for word in conc_tokens:
            if word not in stop_words and not word.isdigit():
                without_stops.append(word)
        words = []
        for word in without_stops:
            # keep only words the German dictionary recognizes
            if len(word) > 1 and german_dict.check(word):
                stemmed = stemmer.stem(word)
                words.append(stemmed)
        # calculate frequency distribution for this concordance line
        tokens = nltk.Text(words)
        fdist = nltk.FreqDist(tokens)
        mc = fdist.most_common(200)
        mcs.append(mc)
        # build word inventory
        for word, hits in mc:
            word_inventory.add(word)
    word_scores = dict()
    word_scorelist = dict()
    for word in word_inventory:
        word_scores[word] = 0
    conc_counter = 0
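    # Score each inventory word per concordance line with an exponential
    # decay: every line halves the previous score (dropping it to 0 below
    # 0.05), then words appearing in the line get (score + 1) * hits, so
    # words that recur across nearby lines score higher than isolated ones.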
    for mc in mcs:
        conc_counter += 1
        for word in word_inventory:
            word_scores[word] *= 0.5
            if word_scores[word] < 0.05:
                word_scores[word] = 0
        for word, hits in mc:
            word_scores[word] = (word_scores[word] + 1) * hits
        for word in word_inventory:
            if word not in word_scorelist:
                word_scorelist[word] = []
            word_scorelist[word].append(word_scores[word])
    # take each word's peak score across all concordance lines
    word_maxes = []
    for word in word_inventory:
        word_maxes.append([word, max(word_scorelist[word])])
    sorted_maxes = sorted(word_maxes, key=lambda l: l[1], reverse=True)
    # guard against corpora that yield fewer than 20 environment words
    top_n = min(20, len(sorted_maxes))
    for x in range(top_n):
        word = sorted_maxes[x][0]
        print(word, word_scorelist[word], '\n')
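    # CSV layout: header row 'hit,<word1>,...,<wordN>', then one row per
    # concordance line holding each word's scaled, truncated score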
    if do_write_out:
        with open(out_directory + word_to_find + '.csv', 'w',
                  encoding="utf8") as csv_file:
            first_line = 'hit'
            for x in range(top_n):
                word = sorted_maxes[x][0]
                first_line += ',' + word
            csv_file.write(first_line)
            # one row per concordance line
            for x in range(conc_counter):
                this_line = str(x + 1)
                for y in range(top_n):
                    word = sorted_maxes[y][0]
                    # scale by 3 and truncate to an integer for the CSV
                    this_line += ',' + str(int(3 * word_scorelist[word][x]))
                print(this_line)
                csv_file.write('\n' + this_line)
if __name__ == '__main__':
    word = input('what word: ')
    corpus = input('what corpus: ')
    if corpus == 'd':
        corpus = './corpus/DasKapitalCleaner.txt'
    print('doing ({}) ({})'.format(word, corpus))
    marx_do_word_environment(corpus, word)