Commit d27e4e6

Create and save context vectors for each context in the datasets.

1 parent ae2d929 · commit d27e4e6
4 files changed: +5375 −0 lines changed
Lines changed: 67 additions & 0 deletions

@@ -0,0 +1,67 @@
from prepro_util import Chunker
import util
from gensim.models import Doc2Vec
import numpy as np


def load_doc2vec(path='/Users/sevgili/Ozge-PhD/wikipedia-doc2vec/all-dim100/wikipedia_document_dim100_with_wikicorpus.doc2vec'):
    # Memory-map the pretrained Wikipedia Doc2Vec model (dim 100) instead of
    # loading it fully into RAM.
    return Doc2Vec.load(path, mmap='r')


class ContextVecCreator(object):
    def __init__(self):
        self.doc2vec = load_doc2vec()
        self.url2graphid = util.load_url2graphid()
        self.chunker = Chunker()
        # Append-mode id-map files recording context id <-> chunk id pairs.
        self.contextid2chunkid_file = open('idmaps/contextid2chunkid.txt', 'a')
        self.chunkid2contextid_file = open('idmaps/chunkid2contextid.txt', 'a')

    def chunk2contextvec(self, chunk, context_id):
        chunk_id, chunk_words, begin_gm, end_gm, ground_truth = chunk
        # infer_vector expects a list of tokens; passing the words joined into
        # a single string would make gensim treat each character as a token.
        context_vec = self.doc2vec.infer_vector(chunk_words)

        self.contextid2chunkid_file.write(str(context_id) + '\t' + str(chunk_id) + '\n')
        self.chunkid2contextid_file.write(str(chunk_id) + '\t' + str(context_id) + '\n')

        return context_vec

    # Create a context vector for each context in the datasets and save them all.
    def create_contextvec(self, dataset_file_paths, dataset_ttl_paths):
        context_embeds = list()
        context_id = 0

        for path in dataset_file_paths:
            for chunk in self.chunker.process(path):
                context_vec = self.chunk2contextvec(chunk, context_id)
                context_embeds.append(context_vec)
                context_id += 1

        for path in dataset_ttl_paths:
            for chunk in self.chunker.process_ttl(path, self.url2graphid):
                context_vec = self.chunk2contextvec(chunk, context_id)
                context_embeds.append(context_vec)
                context_id += 1

        # Row i of the saved matrix holds the vector for context_id i.
        np.save(file='vectors/context_vecs.npy', arr=np.array(context_embeds))
        # Close the id-map files so the mappings are flushed to disk.
        self.contextid2chunkid_file.close()
        self.chunkid2contextid_file.close()


if __name__ == "__main__":
    dataset_file_paths = ['/Users/sevgili/PycharmProjects/end2end_neural_el/data/new_datasets/ace2004.txt',
                          '/Users/sevgili/PycharmProjects/end2end_neural_el/data/new_datasets/aida_dev.txt',
                          '/Users/sevgili/PycharmProjects/end2end_neural_el/data/new_datasets/aida_test.txt',
                          '/Users/sevgili/PycharmProjects/end2end_neural_el/data/new_datasets/aida_train.txt',
                          '/Users/sevgili/PycharmProjects/end2end_neural_el/data/new_datasets/aquaint.txt',
                          '/Users/sevgili/PycharmProjects/end2end_neural_el/data/new_datasets/clueweb.txt',
                          '/Users/sevgili/PycharmProjects/end2end_neural_el/data/new_datasets/msnbc.txt']
    dataset_ttl_paths = ['/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/dbpedia-spotlight-nif.ttl',
                         '/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/kore50-nif.ttl',
                         '/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/News-100.ttl',
                         '/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/Reuters-128.ttl',
                         '/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/RSS-500.ttl']

    creator = ContextVecCreator()
    creator.create_contextvec(dataset_file_paths, dataset_ttl_paths)
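
For reference, a minimal sketch of how downstream code could read the artifacts this script writes. The paths match the script above, but the loading code itself is illustrative and not part of the commit; it assumes vectors are stored in context_id order, which follows from context_id being incremented once per appended vector.

import numpy as np

# Load the saved context vectors; row i is the vector for context_id i.
context_vecs = np.load('vectors/context_vecs.npy')

# Rebuild the context_id -> chunk_id mapping written by ContextVecCreator.
contextid2chunkid = {}
with open('idmaps/contextid2chunkid.txt') as f:
    for line in f:
        context_id, chunk_id = line.rstrip('\n').split('\t')
        contextid2chunkid[int(context_id)] = chunk_id

print(context_vecs.shape)      # e.g. (num_contexts, 100) for the dim-100 model
print(contextid2chunkid[0])    # chunk id associated with the first context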
