Commit

Add word2vec embeddings

timbeiko committed Oct 27, 2017
1 parent a56c1e9 commit da98622

Showing 4 changed files with 32 additions and 10 deletions.
8 changes: 3 additions & 5 deletions README.md
@@ -1,23 +1,21 @@
# Classify Coherence
An attempt to classify sentences from the Penn Discourse Treebank as either coherent or incoherent.

## TODO
- word2vec embeddings

### Getting Started
Everything should run without exotic dependencies; the code relies on TensorFlow and NumPy.
Because the PDTB data is not freely available, it has not been uploaded to this repo.

For the code to run, you will need a data folder with the following subdirectories:

```
data/integers
/json
/padded
/txt
/model
```
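
If you prefer to script this, here is a minimal sketch, assuming it is run from the repository root:

```
import os

# Create the expected data subdirectories, skipping any that already exist.
for subdir in ["integers", "json", "padded", "txt", "model"]:
    path = os.path.join("data", subdir)
    if not os.path.isdir(path):
        os.makedirs(path)
```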
Once you have these, you will need to add a PDTB `relations-XX-XX-XX-{dev | train | test}.json` file to the `/data` directory, and update the value of `relations_json` in `generate_sentences.py` (declared around line 10) to the name of that file.
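
For illustration only, the updated assignment would look something like the following; the `XX-XX-XX` placeholder stands in for your PDTB release, and whether the value needs a `data/` prefix depends on how `generate_sentences.py` builds the path:

```
# generate_sentences.py, around line 10: point this at your PDTB relations file.
relations_json = "relations-XX-XX-XX-train.json"
```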

In order to train with the Google News word2vec embeddings, you will need to download them (available here: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM) and unzip them into the `data/model` directory.
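
As an optional sanity check that the download and extraction worked, the vectors can be loaded with gensim (not a dependency of this project; shown only as a quick test):

```
from gensim.models import KeyedVectors

# Loading the full GoogleNews vectors takes several GB of RAM.
vectors = KeyedVectors.load_word2vec_format(
    "data/model/GoogleNews-vectors-negative300.bin", binary=True)
print(vectors["king"].shape)  # should print (300,)
```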

### [in progress] Detailed Report
A detailed report of this project can be found at https://www.overleaf.com/read/ngfcbdxkcgby.

2 changes: 0 additions & 2 deletions cnn.py
@@ -1,5 +1,3 @@
# From https://github.com/dennybritz/cnn-text-classification-tf/blob/master/text_cnn.py

import tensorflow as tf
import numpy as np

Binary file modified cnn.pyc
32 changes: 29 additions & 3 deletions train.py
@@ -1,10 +1,9 @@
# Flags and training loop from https://github.com/dennybritz/cnn-text-classification-tf/blob/master/train.py
import tensorflow as tf
import numpy as np
import os
import time
import datetime
from cnn import TextCNN
from cnn import CNN
from tensorflow.contrib import learn

# Flags
@@ -15,7 +14,8 @@
tf.flags.DEFINE_string("incoherent_data_file", "./data/padded/incoherent_sentences_arg2_diff_sense.txt", "Data source for the incoherent data.")

# Model Hyperparameters
tf.flags.DEFINE_integer("embedding_dim", 128, "Dimensionality of character embedding (default: 128)")
tf.flags.DEFINE_string("word2vec", "./data/model/GoogleNews-vectors-negative300.bin", "Word2vec file with pre-trained embeddings (default: None)")
tf.flags.DEFINE_integer("embedding_dim", 300, "Dimensionality of character embedding (default: 300, to match GoogleNews embeddings)")
tf.flags.DEFINE_string("filter_sizes", "4", "Comma-separated filter sizes (default: '3,4,5')")
tf.flags.DEFINE_integer("num_filters", 32, "Number of filters per filter size (default: 128)")
tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)")
@@ -143,6 +143,32 @@
# Initialize all variables
sess.run(tf.global_variables_initializer())

if FLAGS.word2vec:
    # initial matrix with random uniform
    initW = np.random.uniform(-0.25, 0.25, (len(vocab_processor.vocabulary_), FLAGS.embedding_dim))
    # load any vectors from the word2vec
    print("Load word2vec file {}\n".format(FLAGS.word2vec))
    with open(FLAGS.word2vec, "rb") as f:
        header = f.readline()
        vocab_size, layer1_size = map(int, header.split())
        binary_len = np.dtype('float32').itemsize * layer1_size
        for line in xrange(vocab_size):
            word = []
            while True:
                ch = f.read(1)
                if ch == ' ':
                    word = ''.join(word)
                    break
                if ch != '\n':
                    word.append(ch)
            idx = vocab_processor.vocabulary_.get(word)
            if idx != 0:
                initW[idx] = np.fromstring(f.read(binary_len), dtype='float32')
            else:
                f.read(binary_len)

    sess.run(cnn.W.assign(initW))

def train_step(x_batch, y_batch):
    """
    A single training step
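
For reference, here is a self-contained sketch of the same binary word2vec parsing that the new `train.py` block performs, written in modern Python; the function name and the `vocab` dict (word to row index) are hypothetical stand-ins, with `vocab` playing the role of `vocab_processor.vocabulary_`:

```
import numpy as np

def load_word2vec_into(init_matrix, vocab, path):
    """Overwrite rows of init_matrix with pre-trained vectors from a binary
    word2vec file; words absent from the file keep their random init."""
    with open(path, "rb") as f:
        # Header line: "<vocab_size> <vector_dim>".
        vocab_size, vector_dim = map(int, f.readline().split())
        binary_len = np.dtype("float32").itemsize * vector_dim
        for _ in range(vocab_size):
            # Each record is the word's bytes, a space, then vector_dim float32s.
            chars = []
            while True:
                ch = f.read(1)
                if ch == b" ":
                    break
                if ch != b"\n":  # skip record-separating newlines
                    chars.append(ch)
            word = b"".join(chars).decode("utf-8", errors="ignore")
            vector = f.read(binary_len)
            if word in vocab:
                init_matrix[vocab[word]] = np.frombuffer(vector, dtype="float32")
    return init_matrix
```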
