# TextClassificationHelper.py (forked from lmoroney/funcode)
# Packages to install: tensorflow, beautifulsoup4, wget
import json
import os
import pathlib
import ssl
import string
import zipfile
from collections import Counter

import numpy as np
import tensorflow as tf
import wget
from bs4 import BeautifulSoup
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


class TextClassificationHelper:
    # Public class variables
    trunc_type = 'post'
    padding_type = 'post'
    oov_tok = "<OOV>"
    num_epochs = 30
    glove_url = "https://storage.googleapis.com/laurencemoroney-blog.appspot.com/glove.twitter.27B.25d.zip"
    data_url = "https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json"
    glove_local = "glove.zip"
    data_local = "sarcasm.json"
    training_quotient = .8  # Fraction of the dataset used for training
    vocab_quotient = .5  # Fraction of the distinct words kept in the vocabulary

    # Private class variables
    __embedding_dim = 25  # Matches the 25-dimensional GloVe embeddings used below
    __training_labels = []
    __testing_labels = []
    __training_padded = []
    __testing_padded = []
    __embedding_matrix = []
    __glove_embeddings = dict()
    __saved_model_dir = 'saved_model/1'
    __tflite_model_name = 'model.tflite'
    __vocab_size = 0  # Calculated later: distinct word count * vocab_quotient
    __tokenizer = None
    __max_length = None
    def create_model(self):
        # Download and prepare the GloVe embeddings into a dictionary,
        # then build the datasets and train the model.
        self.download_and_prepare_glove()
        self.download_and_process_data_file()
        self.train_model()
    def download_and_prepare_glove(self):
        # Downloads the GloVe embeddings to a local zip, extracts them to a
        # .txt file, and reads the embeddings into self.__glove_embeddings.
        # The download can fail on some systems' certificate chains, so
        # certificate verification is disabled here.
        ssl._create_default_https_context = ssl._create_unverified_context
        if os.path.exists(self.glove_local):
            print("GloVe embeddings already downloaded...")
        else:
            print("Downloading GloVe embeddings...")
            wget.download(self.glove_url, self.glove_local)

        if os.path.exists("glove/glove.twitter.27B.25d.txt"):
            print("GloVe embeddings already extracted...")
        else:
            print("Unzipping GloVe embeddings...")
            with zipfile.ZipFile(self.glove_local, 'r') as zip_ref:
                zip_ref.extractall('glove')

        print("Creating dictionary from GloVe embeddings...")
        with open('glove/glove.twitter.27B.25d.txt') as f:
            for line in f:
                values = line.split()
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                self.__glove_embeddings[word] = coefs
        print('Loaded %s word vectors.' % len(self.__glove_embeddings))
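        # Each line of the GloVe file is "<word> f1 f2 ... f25", so (for
        # illustration) __glove_embeddings.get('happy') would be a 25-element
        # float32 vector; words absent from the file simply have no entry.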
    def download_and_process_data_file(self):
        # Stopwords to strip from the headlines (apostrophes already removed,
        # so "he'd" appears as "hed", and so on)
        stopwords = ["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any",
                     "are", "as", "at", "be", "because", "been", "before", "being", "below", "between",
                     "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each",
                     "few", "for", "from", "further", "had", "has", "have", "having", "he", "hed", "hes",
                     "her", "here", "heres", "hers", "herself", "him", "himself", "his", "how", "hows",
                     "i", "id", "ill", "im", "ive", "if", "in", "into", "is", "it", "its", "itself",
                     "lets", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only",
                     "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same",
                     "she", "shed", "shell", "shes", "should", "so", "some", "such", "than", "that",
                     "thats", "the", "their", "theirs", "them", "themselves", "then", "there", "theres",
                     "these", "they", "theyd", "theyll", "theyre", "theyve", "this", "those", "through",
                     "to", "too", "under", "until", "up", "very", "was", "we", "wed", "well", "were",
                     "weve", "what", "whats", "when", "whens", "where", "wheres", "which", "while",
                     "who", "whos", "whom", "why", "whys", "with", "would", "you", "youd", "youll",
                     "youre", "youve", "your", "yours", "yourself", "yourselves"]
        # Translation table that strips all punctuation from a word
        table = str.maketrans('', '', string.punctuation)
        if os.path.exists(self.data_local):
            print("Data already downloaded...")
        else:
            print("Downloading sarcasm data file...")
            wget.download(self.data_url, self.data_local)

        print("Preprocessing JSON file...")
        with open(self.data_local, 'r') as f:
            datastore = json.load(f)
        sentences = []
        labels = []
        urls = []
        for item in datastore:
            sentence = item['headline'].lower()
            # Pad common separators with spaces so they split off as tokens
            sentence = sentence.replace(",", " , ")
            sentence = sentence.replace(".", " . ")
            sentence = sentence.replace("-", " - ")
            sentence = sentence.replace("/", " / ")
            # Strip any HTML markup from the headline
            soup = BeautifulSoup(sentence, features='html.parser')
            sentence = soup.get_text()
            words = sentence.split()
            filtered_sentence = ""
            for word in words:
                word = word.translate(table)  # Remove punctuation
                if word not in stopwords:
                    filtered_sentence = filtered_sentence + word + " "
            sentences.append(filtered_sentence)
            labels.append(item['is_sarcastic'])
            urls.append(item['article_link'])  # Collected but unused downstream
print("Creating training and test datasets...")
count, self.__max_length = self.get_distinct_word_count_and_length(sentences)
self.__vocab_size = int(count * self.vocab_quotient)
to_cut = int(len(sentences) * self.training_quotient)
training_sentences = sentences[0:to_cut]
testing_sentences = sentences[to_cut:]
self.__training_labels = labels[0:to_cut]
self.__testing_labels = labels[to_cut:]
self.__tokenizer = Tokenizer(num_words=self.__vocab_size, oov_token=self.oov_tok)
self.__tokenizer.fit_on_texts(training_sentences)
word_index = self.__tokenizer.word_index
print(len(word_index))
training_sequences = self.__tokenizer.texts_to_sequences(training_sentences)
self.__training_padded = pad_sequences(training_sequences, maxlen=self.__max_length, padding=self.padding_type,
truncating=self.trunc_type)
testing_sequences = self.__tokenizer.texts_to_sequences(testing_sentences)
self.__testing_padded = pad_sequences(testing_sequences, maxlen=self.__max_length, padding=self.padding_type,
truncating=self.trunc_type)
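        # At this point each headline is a fixed-length row of word indices,
        # e.g. (illustrative values only): [412, 87, 9, 2041, 0, 0, ..., 0]
        # where 0 is the padding value and index 1 is reserved for <OOV>.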
        # Diagnostic pass: for each word index, record whether the word has a
        # GloVe vector, plus the cumulative coverage ratio up to that index.
        # These values are not used further here; they are useful for plotting
        # how well GloVe covers the vocabulary.
        xs = []
        ys = []
        cumulative_x = []
        cumulative_y = []
        total_y = 0
        for word, index in self.__tokenizer.word_index.items():
            xs.append(index)
            cumulative_x.append(index)
            if self.__glove_embeddings.get(word) is not None:
                total_y = total_y + 1
                ys.append(1)
            else:
                ys.append(0)
            cumulative_y.append(total_y / index)
        # Build the embedding matrix: row i holds the GloVe vector for the
        # word with index i (rows stay zero for words without a GloVe entry)
        self.__embedding_matrix = np.zeros((self.__vocab_size, self.__embedding_dim))
        for word, index in self.__tokenizer.word_index.items():
            if index > self.__vocab_size - 1:
                # word_index is ordered by frequency, so every remaining
                # index is out of range as well
                break
            embedding_vector = self.__glove_embeddings.get(word)
            if embedding_vector is not None:
                self.__embedding_matrix[index] = embedding_vector

        self.__training_padded = np.array(self.__training_padded)
        self.__training_labels = np.array(self.__training_labels)
        self.__testing_padded = np.array(self.__testing_padded)
        self.__testing_labels = np.array(self.__testing_labels)
    def train_model(self):
        # The embedding layer is initialized with the GloVe matrix and frozen,
        # so only the GRU and dense layers are trained.
        model = tf.keras.Sequential([
            tf.keras.layers.Embedding(self.__vocab_size, self.__embedding_dim,
                                      weights=[self.__embedding_matrix], trainable=False),
            tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32)),
            tf.keras.layers.Dense(16, activation='relu'),
            tf.keras.layers.Dense(1, activation='sigmoid')
        ])
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        model.fit(self.__training_padded, self.__training_labels, epochs=self.num_epochs,
                  validation_data=(self.__testing_padded, self.__testing_labels), verbose=2)

        # Export as a SavedModel, then convert and save the model to TFLite
        export_dir = self.__saved_model_dir
        tf.saved_model.save(model, export_dir)
        converter = tf.lite.TFLiteConverter.from_saved_model(export_dir)
        tflite_model = converter.convert()
        tflite_model_file = pathlib.Path(self.__tflite_model_name)
        tflite_model_file.write_bytes(tflite_model)

        # Write the word dictionary and metadata needed to use the model
        with open('word_dict.json', 'w') as file:
            json.dump(self.__tokenizer.word_index, file)
        meta_data = {"max_length": self.__max_length}
        with open('metadata.json', 'w') as file:
            json.dump(meta_data, file)
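        # The three files written above (model.tflite, word_dict.json,
        # metadata.json) are everything a client needs to run inference; an
        # illustrative sketch of that flow is at the end of this file.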
    def get_distinct_word_count_and_length(self, sentences):
        # Returns the number of distinct words across all sentences and the
        # average sentence length, counted in words rather than characters
        # since pad_sequences' maxlen is measured in tokens.
        word_counts = Counter()
        total_words = 0
        for sentence in sentences:
            words = sentence.split()
            word_counts.update(words)
            total_words += len(words)
        average_length = int(total_words / len(sentences))
        return len(word_counts), average_length
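

# Example usage: a minimal sketch, assuming the module is run directly. The
# original file defines only the class, so this entry point and the sample
# headline below are illustrative assumptions, not part of the repo.
if __name__ == '__main__':
    helper = TextClassificationHelper()
    helper.create_model()

    # Illustrative client-side inference with the exported artifacts. Real
    # client code would mirror the training-time preprocessing (lowercasing,
    # punctuation and stopword removal) and would ideally remap indices
    # beyond the trained vocabulary to 1, the <OOV> index.
    interpreter = tf.lite.Interpreter(model_path='model.tflite')
    interpreter.allocate_tensors()
    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()

    with open('word_dict.json') as f:
        word_index = json.load(f)
    with open('metadata.json') as f:
        max_length = json.load(f)['max_length']

    headline = "granny starting to fear spiders in the garden might be real"
    tokens = [word_index.get(word, 1) for word in headline.lower().split()]
    # Pad with zeros ('post') and keep the first max_length tokens, matching
    # the padding/truncating configuration used in training
    padded = (tokens + [0] * max_length)[:max_length]
    interpreter.set_tensor(input_details[0]['index'],
                           np.array([padded], dtype=input_details[0]['dtype']))
    interpreter.invoke()
    score = interpreter.get_tensor(output_details[0]['index'])[0][0]
    print("Sarcasm score: %.3f" % score)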