# _*_ coding: utf-8 _*_

"""
python_lda.py by xianhu

LDA topic model trained with Gibbs sampling: corpus loading, training,
resuming a saved model, and inference on new articles.
"""

import os
import numpy
import logging
from collections import defaultdict

# Global settings
MAX_ITER_NUM = 10000    # maximum number of iterations (used when iters_num == "auto")
VAR_NUM = 20            # window size of perplexity values used for the variance in "auto" mode


class BiDictionary(object):
    """
    Bidirectional dictionary: a value can be looked up by key and a key by value.
    """

    def __init__(self):
        """
        Create an empty bidirectional dictionary.
        """
        self.dict = {}              # forward map: key -> value
        self.dict_reversed = {}     # reverse map: value -> key

    def __len__(self):
        """
        Return the number of key/value pairs.
        """
        return len(self.dict)

    def __str__(self):
        """
        Return one "key<TAB>value" line per pair (the format written by save_wordmap).
        """
        return "\n".join("%s\t%s" % (key, value) for key, value in self.dict.items())

    def clear(self):
        """
        Remove every pair.
        """
        self.dict.clear()
        self.dict_reversed.clear()

    def add_key_value(self, key, value):
        """
        Add (or re-bind) a key/value pair.

        If the key was bound to another value, or the value to another key, the
        stale entry is dropped first so both maps always describe the same pairs.
        """
        if key in self.dict and self.dict[key] != value:
            self.dict_reversed.pop(self.dict[key], None)
        if value in self.dict_reversed and self.dict_reversed[value] != key:
            self.dict.pop(self.dict_reversed[value], None)
        self.dict[key] = value
        self.dict_reversed[value] = key

    def remove_key_value(self, key, value):
        """
        Remove the pair stored under key; do nothing if key is absent.

        :param key: key of the pair to remove
        :param value: kept for backward compatibility; the value actually stored
            under key is the one removed from the reverse map
        """
        if key in self.dict:
            stored_value = self.dict.pop(key)
            self.dict_reversed.pop(stored_value, None)

    def get_value(self, key, default=None):
        """
        Return the value stored for key, or default if key is absent.
        """
        return self.dict.get(key, default)

    def get_key(self, value, default=None):
        """
        Return the key stored for value, or default if value is absent.
        """
        return self.dict_reversed.get(value, default)

    def contains_key(self, key):
        """
        Return True if key is present.
        """
        return key in self.dict

    def contains_value(self, value):
        """
        Return True if value is present.
        """
        return value in self.dict_reversed

    def keys(self):
        """
        Return a view of all keys.
        """
        return self.dict.keys()

    def values(self):
        """
        Return a view of all values (the keys of the reverse map).
        """
        return self.dict_reversed.keys()

    def items(self):
        """
        Return a view of all (key, value) pairs.
        """
        return self.dict.items()


class CorpusSet(object):
    """
    Corpus container, used as the base class of LdaBase.
    """

    def __init__(self):
        """
        Create an empty corpus.
        """
        # word related state
        self.local_bi = BiDictionary()  # local id <-> word dictionary, key is id, value is word
        self.words_count = 0            # number of word occurrences in the corpus (duplicates counted)
        self.V = 0                      # number of distinct words in the corpus

        # article related state
        self.artids_list = []           # ids of all articles, in reading order
        self.arts_Z = []                # word ids of every article, shape M * len(article)
        self.M = 0                      # number of articles in the corpus

        # state only used for inference (may stay empty)
        self.global_bi = None           # global id <-> word dictionary, key is id, value is word
        self.local_2_global = {}        # maps a local word id to its global word id

    def init_corpus_with_file(self, file_name):
        """
        Initialise the corpus from a UTF-8 text file.

        :param file_name: path of a file whose lines look like: id[tab]word1 word2 word3......
        """
        with open(file_name, "r", encoding="utf-8") as file_iter:
            self.init_corpus_with_articles(file_iter)

    def init_corpus_with_articles(self, article_list):
        """
        Initialise the corpus from an iterable of articles, discarding previous data.

        Lines with fewer than two whitespace-separated fields are skipped. When
        self.global_bi is set, words missing from it are dropped and articles
        left without any word are not stored.

        :param article_list: iterable of strings formatted as: id[tab]word1 word2 word3......
        """
        # reset word data
        self.local_bi.clear()
        self.words_count = 0
        self.V = 0

        # reset article data
        self.artids_list.clear()
        self.arts_Z.clear()
        self.M = 0

        # reset the local -> global mapping
        self.local_2_global.clear()

        for line in article_list:
            # str.split() without arguments never yields empty or padded tokens
            frags = line.strip().split()
            if len(frags) < 2:
                continue
            art_id = frags[0]

            art_wordid_list = []
            for word in frags[1:]:
                if self.local_bi.contains_value(word):
                    local_id = self.local_bi.get_key(word)
                else:
                    local_id = len(self.local_bi)

                # global_bi being None (training) differs from it being empty (inference, nothing known)
                if self.global_bi is None:
                    self.local_bi.add_key_value(local_id, word)
                    art_wordid_list.append(local_id)
                elif self.global_bi.contains_value(word):
                    self.local_bi.add_key_value(local_id, word)
                    art_wordid_list.append(local_id)
                    self.local_2_global[local_id] = self.global_bi.get_key(word)

            # only keep articles that still contain at least one word
            if art_wordid_list:
                self.words_count += len(art_wordid_list)
                self.artids_list.append(art_id)
                self.arts_Z.append(art_wordid_list)

        # derived counters -- words
        self.V = len(self.local_bi)
        logging.debug("words number: %s, %s", self.V, self.words_count)

        # derived counters -- articles
        self.M = len(self.artids_list)
        logging.debug("articles number: %s", self.M)

    def save_wordmap(self, file_name):
        """
        Write the local word dictionary (self.local_bi) to file_name.
        """
        with open(file_name, "w", encoding="utf-8") as f_save:
            f_save.write(str(self.local_bi))

    def load_wordmap(self, file_name):
        """
        Replace self.local_bi with the dictionary stored in file_name and update self.V.

        Blank lines and lines that do not hold exactly "id word" are skipped.
        """
        self.local_bi.clear()
        with open(file_name, "r", encoding="utf-8") as f_load:
            for line in f_load:
                frags = line.strip().split()
                if len(frags) != 2:
                    continue
                self.local_bi.add_key_value(int(frags[0]), frags[1])
        self.V = len(self.local_bi)


class LdaBase(CorpusSet):
    """
    Base class of the LDA model. Index conventions:
    - article index m is in [0, self.M)
    - word id w is in [0, self.V)
    - topic index k is in [0, self.K)
    - position n of a word inside an article is in [0, len(article))
    """

    def __init__(self):
        """
        Create an empty, untrained model.
        """
        super().__init__()

        # basic settings -- 1
        self.dir_path = ""          # directory used to store model files
        self.model_name = ""        # model name, also used as prefix of the saved files
        self.current_iter = 0       # number of iterations already done, used to resume training
        self.iters_num = 0          # number of Gibbs iterations to run: an integer or "auto"
        self.topics_num = 0         # number of topics, same as self.K
        self.K = 0                  # number of topics, same as self.topics_num
        self.twords_num = 0         # number of words written per topic by save_twords

        # basic settings -- 2
        self.alpha = numpy.zeros(self.K)    # hyper-parameter alpha, K floats (init_train_model defaults to 50/K)
        self.beta = numpy.zeros(self.V)     # hyper-parameter beta, V floats (init_train_model defaults to 0.01)

        # basic settings -- 3
        self.Z = []                 # topic of every word, Z[m][n], shape M * len(article)

        # count statistics (derivable from self.Z)
        self.nd = numpy.zeros((self.M, self.K))     # nd[m, k]: words of article m assigned to topic k
        self.ndsum = numpy.zeros((self.M, 1))       # ndsum[m, 0]: total words of article m
        self.nw = numpy.zeros((self.K, self.V))     # nw[k, w]: occurrences of word w assigned to topic k
        self.nwsum = numpy.zeros((self.K, 1))       # nwsum[k, 0]: total words assigned to topic k

        # multinomial parameters
        self.theta = numpy.zeros((self.M, self.K))  # doc-topic distribution, M * K, depends on alpha
        self.phi = numpy.zeros((self.K, self.V))    # topic-word distribution, K * V, depends on beta

        # cached sums
        self.sum_alpha = 0.0        # sum of alpha
        self.sum_beta = 0.0         # sum of beta

        # prior knowledge: {word_id: [k1, k2, ...], ...}
        self.prior_word = defaultdict(list)

        # trained model used during inference
        self.train_model = None

    # -------------------------------------------------- helpers --------------------------------------------------
    def init_statistics_document(self):
        """
        Rebuild self.nd and self.ndsum from self.Z. Requires self.M, self.K and self.Z.
        """
        assert self.M > 0 and self.K > 0 and self.Z

        # builtin int: the numpy.int alias no longer exists in current NumPy
        self.nd = numpy.zeros((self.M, self.K), dtype=int)
        self.ndsum = numpy.zeros((self.M, 1), dtype=int)

        for m in range(self.M):
            for k in self.Z[m]:
                self.nd[m, k] += 1
            self.ndsum[m, 0] = len(self.Z[m])

    def init_statistics_word(self):
        """
        Rebuild self.nw and self.nwsum from self.Z and self.arts_Z. Requires self.V and self.K too.
        """
        assert self.V > 0 and self.K > 0 and self.Z and self.arts_Z

        # builtin int: the numpy.int alias no longer exists in current NumPy
        self.nw = numpy.zeros((self.K, self.V), dtype=int)
        self.nwsum = numpy.zeros((self.K, 1), dtype=int)

        for m in range(self.M):
            for k, w in zip(self.Z[m], self.arts_Z[m]):
                self.nw[k, w] += 1
                self.nwsum[k, 0] += 1

    def init_statistics(self):
        """
        Rebuild all count statistics (document and word counts).
        """
        self.init_statistics_document()
        self.init_statistics_word()

    def sum_alpha_beta(self):
        """
        Cache the sums of alpha and beta.
        """
        self.sum_alpha = self.alpha.sum()
        self.sum_beta = self.beta.sum()

    def calculate_theta(self):
        """
        Compute self.theta (M * K) from the document counts and alpha.
        """
        assert self.sum_alpha > 0
        self.theta = (self.nd + self.alpha) / (self.ndsum + self.sum_alpha)

    def calculate_phi(self):
        """
        Compute self.phi (K * V) from the word counts and beta.
        """
        assert self.sum_beta > 0
        self.phi = (self.nw + self.beta) / (self.nwsum + self.sum_beta)

    # ------------------------------------------------- perplexity -------------------------------------------------
    def calculate_perplexity(self):
        """
        Recompute theta and phi, then return the perplexity of the corpus.

        :return: exp(-(sum of log word probabilities) / self.words_count)
        """
        self.calculate_theta()
        self.calculate_phi()

        log_likelihood = 0.0
        for m in range(self.M):
            for w in self.arts_Z[m]:
                log_likelihood += numpy.log(numpy.sum(self.theta[m] * self.phi[:, w]))
        return numpy.exp(-(log_likelihood / self.words_count))

    # ----------------------------------------------- static helpers -----------------------------------------------
    @staticmethod
    def multinomial_sample(pro_list):
        """
        Draw an index from an unnormalised multinomial distribution.

        NOTE: pro_list is modified in place (turned into its cumulative sum).

        :param pro_list: weights such as [0.2, 0.7, 0.4, 0.1]; index 1 is then the most likely result
        :return: the sampled index
        """
        # accumulate the weights in place
        for k in range(1, len(pro_list)):
            pro_list[k] += pro_list[k-1]

        # numpy.random.rand() is in [0, 1.0); pick the first bucket whose upper bound exceeds u
        u = numpy.random.rand() * pro_list[-1]

        return_index = len(pro_list) - 1
        for t in range(len(pro_list)):
            if pro_list[t] > u:
                return_index = t
                break
        return return_index

    # ----------------------------------------------- Gibbs sampling -----------------------------------------------
    def gibbs_sampling(self, is_calculate_preplexity):
        """
        Run the Gibbs sampling iterations, updating self.Z, the counts and self.current_iter.

        With self.iters_num == "auto" at most MAX_ITER_NUM iterations run, and the
        loop stops early once the variance of the latest VAR_NUM perplexity values
        drops below VAR_NUM / 2 (this needs is_calculate_preplexity to be True).

        :param is_calculate_preplexity: whether to compute the perplexity at every iteration
        """
        pp_list = []
        pp_var = numpy.inf
        auto_mode = self.iters_num == "auto"

        last_iter = self.current_iter + 1
        iters_num = MAX_ITER_NUM if auto_mode else self.iters_num
        for self.current_iter in range(last_iter, last_iter+iters_num):
            info = "......"

            if is_calculate_preplexity:
                pp = self.calculate_perplexity()
                pp_list.append(pp)

                # variance of the latest VAR_NUM values, once enough of them exist
                has_window = len(pp_list) >= VAR_NUM
                pp_var = numpy.var(pp_list[-VAR_NUM:]) if has_window else numpy.inf
                info = ", preplexity: " + str(pp) + ((", var: " + str(pp_var)) if has_window else "")

            logging.debug("\titeration %s%s", self.current_iter, info)

            # convergence test of the "auto" mode
            if auto_mode and pp_var < (VAR_NUM / 2):
                break

            # resample the topic of every word of every article
            for m in range(self.M):
                for n in range(len(self.Z[m])):
                    w = self.arts_Z[m][n]
                    k = self.Z[m][n]

                    # take the current assignment out of the counts
                    self.nd[m, k] -= 1
                    self.ndsum[m, 0] -= 1
                    self.nw[k, w] -= 1
                    self.nwsum[k, 0] -= 1

                    if self.prior_word and (w in self.prior_word):
                        # prior knowledge available: pick one of the topics listed for this word
                        k = numpy.random.choice(self.prior_word[w])
                    else:
                        # doc-topic part for the n-th word of article m
                        theta_p = (self.nd[m] + self.alpha) / (self.ndsum[m, 0] + self.sum_alpha)

                        # topic-word part: inference adds the trained model's counts (note self.beta[w_g])
                        if self.local_2_global and self.train_model:
                            w_g = self.local_2_global[w]
                            phi_p = (self.train_model.nw[:, w_g] + self.nw[:, w] + self.beta[w_g]) / \
                                    (self.train_model.nwsum[:, 0] + self.nwsum[:, 0] + self.sum_beta)
                        else:
                            phi_p = (self.nw[:, w] + self.beta[w]) / (self.nwsum[:, 0] + self.sum_beta)

                        # unnormalised multinomial weights of the topics
                        multi_p = theta_p * phi_p

                        # sample the new topic
                        k = LdaBase.multinomial_sample(multi_p)

                    # put the new assignment back into the counts
                    self.nd[m, k] += 1
                    self.ndsum[m, 0] += 1
                    self.nw[k, w] += 1
                    self.nwsum[k, 0] += 1

                    self.Z[m][n] = k

    # ------------------------------------------- model persistence -------------------------------------------
    def save_parameter(self, file_name):
        """
        Write topics_num, M, V, K, words_count, alpha and beta to file_name.
        """
        with open(file_name, "w", encoding="utf-8") as f_param:
            for item in ["topics_num", "M", "V", "K", "words_count"]:
                f_param.write("%s\t%s\n" % (item, str(getattr(self, item))))
            f_param.write("alpha\t%s\n" % ",".join([str(item) for item in self.alpha]))
            f_param.write("beta\t%s\n" % ",".join([str(item) for item in self.beta]))

    def load_parameter(self, file_name):
        """
        Read the parameters written by save_parameter. Blank lines are skipped.
        """
        with open(file_name, "r", encoding="utf-8") as f_param:
            for line in f_param:
                if not line.strip():
                    continue
                key, value = line.strip().split()
                if key in ["topics_num", "M", "V", "K", "words_count"]:
                    setattr(self, key, int(value))
                elif key in ["alpha", "beta"]:
                    setattr(self, key, numpy.array([float(item) for item in value.split(",")]))

    def save_zvalue(self, file_name):
        """
        Write one line per article: article id, then "word_id:topic" for each word.
        """
        with open(file_name, "w", encoding="utf-8") as f_zvalue:
            for m in range(self.M):
                out_line = [str(w) + ":" + str(k) for w, k in zip(self.arts_Z[m], self.Z[m])]
                f_zvalue.write(self.artids_list[m] + "\t" + " ".join(out_line) + "\n")

    def load_zvalue(self, file_name):
        """
        Read artids_list, arts_Z and Z from a file written by save_zvalue. Blank lines are skipped.
        """
        self.arts_Z = []
        self.artids_list = []
        self.Z = []
        with open(file_name, "r", encoding="utf-8") as f_zvalue:
            for line in f_zvalue:
                frags = line.strip().split()
                if not frags:
                    continue
                w_k_list = [value.split(":") for value in frags[1:]]

                self.artids_list.append(frags[0])
                self.arts_Z.append([int(item[0]) for item in w_k_list])
                self.Z.append([int(item[1]) for item in w_k_list])

    def save_twords(self, file_name):
        """
        Write, for every topic, its most probable words (at most self.twords_num) with their phi value.
        """
        self.calculate_phi()
        out_num = min(self.twords_num, self.V)
        with open(file_name, "w", encoding="utf-8") as f_twords:
            for k in range(self.K):
                words_list = sorted([(w, self.phi[k, w]) for w in range(self.V)], key=lambda x: x[1], reverse=True)
                f_twords.write("Topic %dth:\n" % k)
                f_twords.writelines(["\t%s %f\n" % (self.local_bi.get_value(w), p) for w, p in words_list[:out_num]])

    def load_twords(self, file_name):
        """
        Load prior knowledge (self.prior_word) from a file in the save_twords format.

        Blank lines and words missing from self.local_bi are ignored.
        """
        self.prior_word.clear()
        topic = -1
        with open(file_name, "r", encoding="utf-8") as f_twords:
            for line in f_twords:
                if line.startswith("Topic"):
                    # header looks like "Topic 12th:" -> keep the digits only
                    topic = int(line.strip()[6:-3])
                    continue
                frags = line.strip().split()
                if not frags:
                    continue
                word_id = self.local_bi.get_key(frags[0])
                if word_id is not None:
                    self.prior_word[word_id].append(topic)

    def save_tag(self, file_name):
        """
        Write one line per article: article id, then its theta values.
        """
        self.calculate_theta()
        with open(file_name, "w", encoding="utf-8") as f_tag:
            for m in range(self.M):
                f_tag.write("%s\t%s\n" % (self.artids_list[m], " ".join([str(item) for item in self.theta[m]])))

    def save_model(self):
        """
        Save the model into self.dir_path as "<model_name>-<current_iter>.<suffix>" files.
        """
        name_prefix = "%s-%05d" % (self.model_name, self.current_iter)

        # training result
        self.save_parameter(os.path.join(self.dir_path, "%s.%s" % (name_prefix, "param")))
        self.save_wordmap(os.path.join(self.dir_path, "%s.%s" % (name_prefix, "wordmap")))
        self.save_zvalue(os.path.join(self.dir_path, "%s.%s" % (name_prefix, "zvalue")))

        # extra, derived data
        self.save_twords(os.path.join(self.dir_path, "%s.%s" % (name_prefix, "twords")))
        self.save_tag(os.path.join(self.dir_path, "%s.%s" % (name_prefix, "tag")))

    def load_model(self):
        """
        Load the param, wordmap and zvalue files named after self.model_name and self.current_iter.
        """
        name_prefix = "%s-%05d" % (self.model_name, self.current_iter)

        self.load_parameter(os.path.join(self.dir_path, "%s.%s" % (name_prefix, "param")))
        self.load_wordmap(os.path.join(self.dir_path, "%s.%s" % (name_prefix, "wordmap")))
        self.load_zvalue(os.path.join(self.dir_path, "%s.%s" % (name_prefix, "zvalue")))


class LdaModel(LdaBase):
    """
    LDA model: training, resuming training, and inference.
    """

    def init_train_model(self, dir_path, model_name, current_iter, iters_num=None, topics_num=10, twords_num=200,
                         alpha=-1.0, beta=0.01, data_file="", prior_file=""):
        """
        Initialise a training model: a new one if current_iter == 0, otherwise a saved one.

        A new model uses every parameter (prior_file is optional). Loading a saved
        model only uses dir_path, model_name, current_iter, iters_num and twords_num.

        :param dir_path: directory holding the model files
        :param model_name: name of the model, prefix of its files
        :param current_iter: 0 for a new model, otherwise the iteration of the saved model to load
        :param iters_num: number of iterations to run, an integer or "auto"
        :param topics_num: number of topics
        :param twords_num: number of words written per topic
        :param alpha: value of every alpha entry; 50 / topics_num is used when it is not positive
        :param beta: value of every beta entry; 0.01 is used when it is not positive
        :param data_file: corpus file, one article per line
        :param prior_file: optional prior knowledge file in the twords format
        :return: self
        """
        if current_iter == 0:
            logging.debug("init a new train model")

            # corpus
            self.init_corpus_with_file(data_file)

            # settings
            self.dir_path = dir_path
            self.model_name = model_name
            self.current_iter = current_iter
            self.iters_num = iters_num
            self.topics_num = topics_num
            self.K = topics_num
            self.twords_num = twords_num

            # alpha and beta
            self.alpha = numpy.full(self.K, alpha if alpha > 0 else (50.0 / self.K), dtype=float)
            self.beta = numpy.full(self.V, beta if beta > 0 else 0.01, dtype=float)

            # random topic for every word, so the counts can be built
            self.Z = [[numpy.random.randint(self.K) for _ in article] for article in self.arts_Z]
        else:
            logging.debug("init an existed model")

            # settings
            self.dir_path = dir_path
            self.model_name = model_name
            self.current_iter = current_iter
            self.iters_num = iters_num
            self.twords_num = twords_num

            # saved model
            self.load_model()

        # count statistics
        self.init_statistics()

        # sums of alpha and beta
        self.sum_alpha_beta()

        # prior knowledge
        if prior_file:
            self.load_twords(prior_file)

        return self

    def begin_gibbs_sampling_train(self, is_calculate_preplexity=True):
        """
        Train the model by Gibbs sampling over the whole corpus, then save it.

        :param is_calculate_preplexity: whether to compute the perplexity at every iteration
        """
        logging.debug("sample iteration start, iters_num: %s", self.iters_num)
        self.gibbs_sampling(is_calculate_preplexity)
        logging.debug("sample iteration finish")

        logging.debug("save model")
        self.save_model()

    def init_inference_model(self, train_model):
        """
        Initialise this object as an inference model backed by train_model.

        :param train_model: trained model whose topics, alpha, beta, counts and dictionary are reused
        """
        self.train_model = train_model

        # number of topics
        self.topics_num = train_model.topics_num
        self.K = train_model.K

        # alpha (K floats) and beta (indexed by the trained model's word ids) are shared as is
        self.alpha = train_model.alpha
        self.beta = train_model.beta
        self.sum_alpha_beta()

        # the trained model's dictionary becomes the global dictionary of this corpus
        self.global_bi = train_model.local_bi

    def inference_data(self, article_list, iters_num=100, repeat_num=3):
        """
        Infer the topic distribution of new articles with the trained model.

        Articles without any word known to the trained model are dropped, so the
        rows of the result follow self.artids_list, not necessarily article_list.

        :param article_list: iterable of strings formatted as: id[tab]word1 word2 word3......
        :param iters_num: number of Gibbs iterations of each run
        :param repeat_num: number of runs whose theta values are averaged
        :return: array of shape (self.M, self.K); empty (0, K) when no article is usable
        """
        # corpus
        self.init_corpus_with_articles(article_list)

        return_theta = numpy.zeros((self.M, self.K))

        # nothing to sample: init_statistics would fail its M > 0 precondition
        if self.M == 0:
            return return_theta

        for i in range(repeat_num):
            logging.debug("inference repeat_num: %s", i+1)

            self.current_iter = 0
            self.iters_num = iters_num

            # random topic for every word, so the counts can be built
            self.Z = [[numpy.random.randint(self.K) for _ in article] for article in self.arts_Z]

            self.init_statistics()

            self.gibbs_sampling(is_calculate_preplexity=False)

            self.calculate_theta()
            return_theta += self.theta

        # average over the runs
        return return_theta / repeat_num


if __name__ == "__main__":
    """
    Demo / manual test code
    """
    logging.basicConfig(level=logging.DEBUG, format="%(asctime)s\t%(levelname)s\t%(message)s")

    # "train" or "inference"
    test_type = "train"
    # test_type = "inference"

    if test_type == "train":
        model = LdaModel()
        # passing prior_file decides whether prior knowledge is used
        model.init_train_model("data/", "model", current_iter=0, iters_num="auto", topics_num=10, data_file="corpus.txt")
        # model.init_train_model("data/", "model", current_iter=0, iters_num="auto", topics_num=10, data_file="corpus.txt", prior_file="prior.twords")
        model.begin_gibbs_sampling_train()
    elif test_type == "inference":
        model = LdaModel()
        model.init_inference_model(LdaModel().init_train_model("data/", "model", current_iter=134))
        data = [
            "cn åªå æ¼«ç» åªå æ¼«ç» æ¼«ç» æ´å åªå æ¼«ç» èµæº å·æ 彿¼« å ¨å½© æ¥æ¼« 宿¶ å¨çº¿ç éå¿ææ¬² ç»é æ¼«ç» èµæº é»ç½ å ¨å½© èªæµ·ç",
            "co aircloud aircloud 硬件 è®¾å¤ wifi æºè½ æè¦ å¹³æ¿çµè çµè åå¨ aircloud æä»¶ è¿ç¨ åå· aircloud 硬件 è®¾å¤ wifi"
        ]
        result = model.inference_data(data)

    # leave the program
    exit()