import networkx as nx import nltk import json from tqdm import tqdm import numpy as np try: from .utils import check_file except ImportError: from utils.utils import check_file __all__ = ['extract_english', 'construct_graph', 'merged_relations'] relation_groups = [ 'atlocation/locatednear', 'capableof', 'causes/causesdesire/*motivatedbygoal', 'createdby', 'desires', 'antonym/distinctfrom', 'hascontext', 'hasproperty', 'hassubevent/hasfirstsubevent/haslastsubevent/hasprerequisite/entails/mannerof', 'isa/instanceof/definedas', 'madeof', 'notcapableof', 'notdesires', 'partof/*hasa', 'relatedto/similarto/synonym', 'usedfor', 'receivesaction', ] merged_relations = [ 'antonym', 'atlocation', 'capableof', 'causes', 'createdby', 'isa', 'desires', 'hassubevent', 'partof', 'hascontext', 'hasproperty', 'madeof', 'notcapableof', 'notdesires', 'receivesaction', 'relatedto', 'usedfor', ] relation_text = [ 'is the antonym of', 'is at location of', 'is capable of', 'causes', 'is created by', 'is a kind of', 'desires', 'has subevent', 'is part of', 'has context', 'has property', 'is made of', 'is not capable of', 'does not desires', 'is', 'is related to', 'is used for', ] def load_merge_relation(): relation_mapping = dict() for line in relation_groups: ls = line.strip().split('/') rel = ls[0] for l in ls: if l.startswith("*"): relation_mapping[l[1:]] = "*" + rel else: relation_mapping[l] = rel return relation_mapping def del_pos(s): """ Deletes part-of-speech encoding from an entity string, if present. :param s: Entity string. :return: Entity string with part-of-speech encoding removed. """ if s.endswith("/n") or s.endswith("/a") or s.endswith("/v") or s.endswith("/r"): s = s[:-2] return s def extract_english(conceptnet_path, output_csv_path, output_vocab_path): """ Reads original conceptnet csv file and extracts all English relations (head and tail are both English entities) into a new file, with the following format for each line: . :return: """ print('extracting English concepts and relations from ConceptNet...') relation_mapping = load_merge_relation() num_lines = sum(1 for line in open(conceptnet_path, 'r', encoding='utf-8')) cpnet_vocab = [] concepts_seen = set() with open(conceptnet_path, 'r', encoding="utf8") as fin, \ open(output_csv_path, 'w', encoding="utf8") as fout: for line in tqdm(fin, total=num_lines): toks = line.strip().split('\t') if toks[2].startswith('/c/en/') and toks[3].startswith('/c/en/'): """ Some preprocessing: - Remove part-of-speech encoding. - Split("/")[-1] to trim the "/c/en/" and just get the entity name, convert all to - Lowercase for uniformity. """ rel = toks[1].split("/")[-1].lower() head = del_pos(toks[2]).split("/")[-1].lower() tail = del_pos(toks[3]).split("/")[-1].lower() if not head.replace("_", "").replace("-", "").isalpha(): continue if not tail.replace("_", "").replace("-", "").isalpha(): continue if rel not in relation_mapping: continue rel = relation_mapping[rel] if rel.startswith("*"): head, tail, rel = tail, head, rel[1:] data = json.loads(toks[4]) fout.write('\t'.join([rel, head, tail, str(data["weight"])]) + '\n') for w in [head, tail]: if w not in concepts_seen: concepts_seen.add(w) cpnet_vocab.append(w) with open(output_vocab_path, 'w') as fout: for word in cpnet_vocab: fout.write(word + '\n') print(f'extracted ConceptNet csv file saved to {output_csv_path}') print(f'extracted concept vocabulary saved to {output_vocab_path}') print() def construct_graph(cpnet_csv_path, cpnet_vocab_path, output_path, prune=True): print('generating ConceptNet graph file...') nltk.download('stopwords', quiet=True) nltk_stopwords = nltk.corpus.stopwords.words('english') nltk_stopwords += ["like", "gone", "did", "going", "would", "could", "get", "in", "up", "may", "wanter"] # issue: mismatch with the stop words in grouding.py blacklist = set(["uk", "us", "take", "make", "object", "person", "people"]) # issue: mismatch with the blacklist in grouding.py concept2id = {} id2concept = {} with open(cpnet_vocab_path, "r", encoding="utf8") as fin: id2concept = [w.strip() for w in fin] concept2id = {w: i for i, w in enumerate(id2concept)} id2relation = merged_relations relation2id = {r: i for i, r in enumerate(id2relation)} graph = nx.MultiDiGraph() nrow = sum(1 for _ in open(cpnet_csv_path, 'r', encoding='utf-8')) with open(cpnet_csv_path, "r", encoding="utf8") as fin: def not_save(cpt): if cpt in blacklist: return True '''originally phrases like "branch out" would not be kept in the graph''' # for t in cpt.split("_"): # if t in nltk_stopwords: # return True return False attrs = set() for line in tqdm(fin, total=nrow): ls = line.strip().split('\t') rel = relation2id[ls[0]] subj = concept2id[ls[1]] obj = concept2id[ls[2]] weight = float(ls[3]) if prune and (not_save(ls[1]) or not_save(ls[2]) or id2relation[rel] == "hascontext"): continue # if id2relation[rel] == "relatedto" or id2relation[rel] == "antonym": # weight -= 0.3 # continue if subj == obj: # delete loops continue # weight = 1 + float(math.exp(1 - weight)) # issue: ??? if (subj, obj, rel) not in attrs: graph.add_edge(subj, obj, rel=rel, weight=weight) attrs.add((subj, obj, rel)) graph.add_edge(obj, subj, rel=rel + len(relation2id), weight=weight) attrs.add((obj, subj, rel + len(relation2id))) nx.write_gpickle(graph, output_path) print(f"graph file saved to {output_path}") print() def glove_init(input, output, concept_file): embeddings_file = output + '.npy' vocabulary_file = output.split('.')[0] + '.vocab.txt' output_dir = '/'.join(output.split('/')[:-1]) output_prefix = output.split('/')[-1] words = [] vectors = [] vocab_exist = check_file(vocabulary_file) print("loading embedding") with open(input, 'rb') as f: for line in f: fields = line.split() if len(fields) <= 2: continue if not vocab_exist: word = fields[0].decode('utf-8') words.append(word) vector = np.fromiter((float(x) for x in fields[1:]), dtype=np.float) vectors.append(vector) dim = vector.shape[0] print("converting") matrix = np.array(vectors, dtype="float32") print("writing") np.save(embeddings_file, matrix) text = '\n'.join(words) if not vocab_exist: with open(vocabulary_file, 'wb') as f: f.write(text.encode('utf-8')) def load_glove_from_npy(glove_vec_path, glove_vocab_path): vectors = np.load(glove_vec_path) with open(glove_vocab_path, "r", encoding="utf8") as f: vocab = [l.strip() for l in f.readlines()] assert (len(vectors) == len(vocab)) glove_embeddings = {} for i in range(0, len(vectors)): glove_embeddings[vocab[i]] = vectors[i] print("Read " + str(len(glove_embeddings)) + " glove vectors.") return glove_embeddings def weighted_average(avg, new, n): # TODO: maybe a better name for this function? return ((n - 1) / n) * avg + (new / n) def max_pooling(old, new): # TODO: maybe a better name for this function? return np.maximum(old, new) def write_embeddings_npy(embeddings, embeddings_cnt, npy_path, vocab_path): words = [] vectors = [] for key, vec in embeddings.items(): words.append(key) vectors.append(vec) matrix = np.array(vectors, dtype="float32") print(matrix.shape) print("Writing embeddings matrix to " + npy_path, flush=True) np.save(npy_path, matrix) print("Finished writing embeddings matrix to " + npy_path, flush=True) if not check_file(vocab_path): print("Writing vocab file to " + vocab_path, flush=True) to_write = ["\t".join([w, str(embeddings_cnt[w])]) for w in words] with open(vocab_path, "w", encoding="utf8") as f: f.write("\n".join(to_write)) print("Finished writing vocab file to " + vocab_path, flush=True) def create_embeddings_glove(pooling="max", dim=100): print("Pooling: " + pooling) with open(concept_file, "r", encoding="utf8") as f: triple_str_json = json.load(f) print("Loaded " + str(len(triple_str_json)) + " triple strings.") glove_embeddings = load_glove_from_npy(embeddings_file, vocabulary_file) print("Loaded glove.", flush=True) concept_embeddings = {} concept_embeddings_cnt = {} rel_embeddings = {} rel_embeddings_cnt = {} for i in tqdm(range(len(triple_str_json))): data = triple_str_json[i] words = data["string"].strip().split(" ") rel = data["rel"] subj_start = data["subj_start"] subj_end = data["subj_end"] obj_start = data["obj_start"] obj_end = data["obj_end"] subj_words = words[subj_start:subj_end] obj_words = words[obj_start:obj_end] subj = " ".join(subj_words) obj = " ".join(obj_words) # counting the frequency (only used for the avg pooling) if subj not in concept_embeddings: concept_embeddings[subj] = np.zeros((dim,)) concept_embeddings_cnt[subj] = 0 concept_embeddings_cnt[subj] += 1 if obj not in concept_embeddings: concept_embeddings[obj] = np.zeros((dim,)) concept_embeddings_cnt[obj] = 0 concept_embeddings_cnt[obj] += 1 if rel not in rel_embeddings: rel_embeddings[rel] = np.zeros((dim,)) rel_embeddings_cnt[rel] = 0 rel_embeddings_cnt[rel] += 1 if pooling == "avg": subj_encoding_sum = sum([glove_embeddings.get(word, np.zeros((dim,))) for word in subj]) obj_encoding_sum = sum([glove_embeddings.get(word, np.zeros((dim,))) for word in obj]) if rel in ["relatedto", "antonym"]: # Symmetric relation. rel_encoding_sum = sum([glove_embeddings.get(word, np.zeros((dim,))) for word in words]) - subj_encoding_sum - obj_encoding_sum else: # Asymmetrical relation. rel_encoding_sum = obj_encoding_sum - subj_encoding_sum subj_len = subj_end - subj_start obj_len = obj_end - obj_start subj_encoding = subj_encoding_sum / subj_len obj_encoding = obj_encoding_sum / obj_len rel_encoding = rel_encoding_sum / (len(words) - subj_len - obj_len) concept_embeddings[subj] = subj_encoding concept_embeddings[obj] = obj_encoding rel_embeddings[rel] = weighted_average(rel_embeddings[rel], rel_encoding, rel_embeddings_cnt[rel]) elif pooling == "max": subj_encoding = np.amax([glove_embeddings.get(word, np.zeros((dim,))) for word in subj_words], axis=0) obj_encoding = np.amax([glove_embeddings.get(word, np.zeros((dim,))) for word in obj_words], axis=0) mask_rel = [] for j in range(len(words)): if subj_start <= j < subj_end or obj_start <= j < obj_end: continue mask_rel.append(j) rel_vecs = [glove_embeddings.get(words[i], np.zeros((dim,))) for i in mask_rel] rel_encoding = np.amax(rel_vecs, axis=0) # here it is actually avg over max for relation concept_embeddings[subj] = max_pooling(concept_embeddings[subj], subj_encoding) concept_embeddings[obj] = max_pooling(concept_embeddings[obj], obj_encoding) rel_embeddings[rel] = weighted_average(rel_embeddings[rel], rel_encoding, rel_embeddings_cnt[rel]) print(str(len(concept_embeddings)) + " concept embeddings") print(str(len(rel_embeddings)) + " relation embeddings") write_embeddings_npy(concept_embeddings, concept_embeddings_cnt, f'{output_dir}/concept.{output_prefix}.{pooling}.npy', f'{output_dir}/concept.glove.{pooling}.txt') write_embeddings_npy(rel_embeddings, rel_embeddings_cnt, f'{output_dir}/relation.{output_prefix}.{pooling}.npy', f'{output_dir}/relation.glove.{pooling}.txt') create_embeddings_glove(dim=dim) if __name__ == "__main__": glove_init("../data/glove/glove.6B.200d.txt", "../data/glove/glove.200d", '../data/glove/tp_str_corpus.json')