from prepro_util import Chunker
import util
from gensim.models import Doc2Vec
import numpy as np


def load_doc2vec(path='/Users/sevgili/Ozge-PhD/wikipedia-doc2vec/all-dim100/wikipedia_document_dim100_with_wikicorpus.doc2vec'):
    # Load the pretrained Wikipedia Doc2Vec model, memory-mapped read-only.
    return Doc2Vec.load(path, mmap='r')
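
# Hedged usage sketch (assumes the Doc2Vec model file exists at the default path):
#   model = load_doc2vec()
#   vec = model.infer_vector(['tokenized', 'context', 'words'])  # expects a token list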


class ContextVecCreator(object):
    def __init__(self):
        self.doc2vec = load_doc2vec()
        self.url2graphid = util.load_url2graphid()
        self.chunker = Chunker()
        # Append-mode id-map files recording the context-id <-> chunk-id mapping.
        self.contextid2chunkid_file = open('idmaps/contextid2chunkid.txt', 'a')
        self.chunkid2contextid_file = open('idmaps/chunkid2contextid.txt', 'a')

    def chunk2contextvec(self, chunk, context_id):
        chunk_id, chunk_words, begin_gm, end_gm, ground_truth = chunk
        # infer_vector expects a list of tokens; a joined string would be
        # iterated character by character.
        context_vec = self.doc2vec.infer_vector(chunk_words)

        # Record the id mapping in both directions.
        self.contextid2chunkid_file.write(str(context_id) + '\t' + str(chunk_id) + '\n')
        self.chunkid2contextid_file.write(str(chunk_id) + '\t' + str(context_id) + '\n')

        return context_vec

    # Create a context vector for each context (chunk) in the datasets.
    def create_contextvec(self, dataset_file_paths, dataset_ttl_paths):
        context_embeds = list()
        context_id = 0

        for path in dataset_file_paths:
            for chunk in self.chunker.process(path):
                context_vec = self.chunk2contextvec(chunk, context_id)

                context_embeds.append(context_vec)
                context_id += 1

        for path in dataset_ttl_paths:
            for chunk in self.chunker.process_ttl(path, self.url2graphid):
                context_vec = self.chunk2contextvec(chunk, context_id)

                context_embeds.append(context_vec)
                context_id += 1

        # Close the id-map files so buffered writes are flushed to disk.
        self.contextid2chunkid_file.close()
        self.chunkid2contextid_file.close()

        np.save(file='vectors/context_vecs.npy', arr=np.array(context_embeds))


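# Hedged sketch: a hypothetical helper (not in the original) showing how the
# vectors and id map written above could be loaded back by downstream code.
def load_context_vecs(vec_path='vectors/context_vecs.npy',
                      map_path='idmaps/contextid2chunkid.txt'):
    context_vecs = np.load(vec_path)
    contextid2chunkid = {}
    with open(map_path) as f:
        for line in f:
            context_id, chunk_id = line.rstrip('\n').split('\t')
            contextid2chunkid[int(context_id)] = chunk_id
    return context_vecs, contextid2chunkid
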
if __name__ == "__main__":
    # Entity-linking datasets in plain-text format.
    dataset_file_paths = ['/Users/sevgili/PycharmProjects/end2end_neural_el/data/new_datasets/ace2004.txt',
                          '/Users/sevgili/PycharmProjects/end2end_neural_el/data/new_datasets/aida_dev.txt',
                          '/Users/sevgili/PycharmProjects/end2end_neural_el/data/new_datasets/aida_test.txt',
                          '/Users/sevgili/PycharmProjects/end2end_neural_el/data/new_datasets/aida_train.txt',
                          '/Users/sevgili/PycharmProjects/end2end_neural_el/data/new_datasets/aquaint.txt',
                          '/Users/sevgili/PycharmProjects/end2end_neural_el/data/new_datasets/clueweb.txt',
                          '/Users/sevgili/PycharmProjects/end2end_neural_el/data/new_datasets/msnbc.txt']
    # Entity-linking datasets in NIF (ttl) format.
    dataset_ttl_paths = ['/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/dbpedia-spotlight-nif.ttl',
                         '/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/kore50-nif.ttl',
                         '/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/News-100.ttl',
                         '/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/Reuters-128.ttl',
                         '/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/RSS-500.ttl']

    creator = ContextVecCreator()
    creator.create_contextvec(dataset_file_paths, dataset_ttl_paths)