Commit

reports
olafurjohannsson committed Nov 3, 2023
1 parent de1af30 commit 2880ebb
Showing 7 changed files with 414 additions and 103 deletions.
62 changes: 31 additions & 31 deletions src/BaselineClassifiersBinary.ipynb
@@ -76,7 +76,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 14,
"id": "3ef9160a",
"metadata": {},
"outputs": [
@@ -99,15 +99,15 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Pipeline(steps=[('vect', TfidfVectorizer()), ('clf', LogisticRegression())])\n",
"Pipeline(steps=[('vect', TfidfVectorizer()), ('clf', MultinomialNB())])\n",
" precision recall f1-score support\n",
"\n",
" 0 0.8856 0.8657 0.8756 8208\n",
" 1 0.8700 0.8893 0.8795 8292\n",
" 0 0.8429 0.8623 0.8525 8208\n",
" 1 0.8605 0.8409 0.8506 8292\n",
"\n",
" accuracy 0.8776 16500\n",
" macro avg 0.8778 0.8775 0.8775 16500\n",
"weighted avg 0.8777 0.8776 0.8776 16500\n",
" accuracy 0.8516 16500\n",
" macro avg 0.8517 0.8516 0.8516 16500\n",
"weighted avg 0.8518 0.8516 0.8516 16500\n",
"\n",
"DATASET ../IMDB-Dataset-GoogleTranslate-proccessed-nefnir.csv\n"
]
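The baseline pipeline printed in these reports can be sketched with scikit-learn as follows; since the IMDB CSV files named in the diff are not available here, a tiny toy corpus stands in for the real training data (an assumption for illustration only):

```python
# Sketch of the TF-IDF + Multinomial Naive Bayes baseline shown above.
# The toy corpus below is a stand-in for the translated IMDB reviews.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

texts = [
    "frábær mynd, mjög skemmtileg",   # positive
    "hræðileg mynd, mjög leiðinleg",  # negative
    "skemmtileg og góð mynd",         # positive
    "leiðinleg og slæm mynd",         # negative
]
labels = [1, 0, 1, 0]

pipe = Pipeline([("vect", TfidfVectorizer()), ("clf", MultinomialNB())])
pipe.fit(texts, labels)
print(pipe)
print(classification_report(labels, pipe.predict(texts), digits=4))
```

Swapping `MultinomialNB` for `LogisticRegression` or `LinearSVC` reproduces the other baseline variants compared in this commit.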
@@ -124,15 +124,15 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Pipeline(steps=[('vect', TfidfVectorizer()), ('clf', LogisticRegression())])\n",
"Pipeline(steps=[('vect', TfidfVectorizer()), ('clf', MultinomialNB())])\n",
" precision recall f1-score support\n",
"\n",
" 0 0.8958 0.8762 0.8859 8229\n",
" 1 0.8794 0.8986 0.8889 8271\n",
" 0 0.8477 0.8714 0.8594 8229\n",
" 1 0.8684 0.8443 0.8562 8271\n",
"\n",
" accuracy 0.8874 16500\n",
" macro avg 0.8876 0.8874 0.8874 16500\n",
"weighted avg 0.8876 0.8874 0.8874 16500\n",
" accuracy 0.8578 16500\n",
" macro avg 0.8581 0.8579 0.8578 16500\n",
"weighted avg 0.8581 0.8578 0.8578 16500\n",
"\n",
"DATASET ../IMDB-Dataset-Processed.csv\n"
]
@@ -149,15 +149,15 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Pipeline(steps=[('vect', TfidfVectorizer()), ('clf', LogisticRegression())])\n",
"Pipeline(steps=[('vect', TfidfVectorizer()), ('clf', MultinomialNB())])\n",
" precision recall f1-score support\n",
"\n",
" 0 0.9022 0.8815 0.8917 8208\n",
" 1 0.8853 0.9055 0.8952 8292\n",
" 0 0.8462 0.8732 0.8595 8208\n",
" 1 0.8704 0.8429 0.8564 8292\n",
"\n",
" accuracy 0.8935 16500\n",
" macro avg 0.8938 0.8935 0.8935 16500\n",
"weighted avg 0.8937 0.8935 0.8935 16500\n",
" accuracy 0.8579 16500\n",
" macro avg 0.8583 0.8580 0.8579 16500\n",
"weighted avg 0.8583 0.8579 0.8579 16500\n",
"\n",
"DATASET None\n"
]
@@ -174,15 +174,15 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Pipeline(steps=[('vect', TfidfVectorizer()), ('clf', LogisticRegression())])\n",
"Pipeline(steps=[('vect', TfidfVectorizer()), ('clf', MultinomialNB())])\n",
" precision recall f1-score support\n",
"\n",
" 0 0.4213 0.4637 0.4415 179\n",
" 1 0.8950 0.8777 0.8862 932\n",
" 0 0.2401 0.6089 0.3444 179\n",
" 1 0.8935 0.6298 0.7388 932\n",
"\n",
" accuracy 0.8110 1111\n",
" macro avg 0.6581 0.6707 0.6639 1111\n",
"weighted avg 0.8187 0.8110 0.8146 1111\n",
" accuracy 0.6265 1111\n",
" macro avg 0.5668 0.6194 0.5416 1111\n",
"weighted avg 0.7882 0.6265 0.6753 1111\n",
"\n",
"DATASET None\n"
]
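The macro and weighted averages in these classification reports follow directly from the per-class scores and supports; a quick check against the last report above, with the F1 values copied from the printed table:

```python
# Recompute the averaged F1 scores from the per-class numbers above.
f1 = {"negative": 0.3444, "positive": 0.7388}
support = {"negative": 179, "positive": 932}

macro = sum(f1.values()) / len(f1)  # unweighted mean over classes
weighted = sum(f1[c] * support[c] for c in f1) / sum(support.values())

print(round(macro, 4), round(weighted, 4))  # → 0.5416 0.6753
```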
@@ -207,15 +207,15 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Pipeline(steps=[('vect', TfidfVectorizer()), ('clf', LogisticRegression())])\n",
"Pipeline(steps=[('vect', TfidfVectorizer()), ('clf', MultinomialNB())])\n",
" precision recall f1-score support\n",
"\n",
" 0 0.4439 0.4637 0.4536 179\n",
" 1 0.8961 0.8884 0.8922 932\n",
" 0 0.2550 0.5698 0.3523 179\n",
" 1 0.8917 0.6803 0.7718 932\n",
"\n",
" accuracy 0.8200 1111\n",
" macro avg 0.6700 0.6760 0.6729 1111\n",
"weighted avg 0.8232 0.8200 0.8216 1111\n",
" accuracy 0.6625 1111\n",
" macro avg 0.5734 0.6250 0.5620 1111\n",
"weighted avg 0.7891 0.6625 0.7042 1111\n",
"\n"
]
},
66 changes: 33 additions & 33 deletions src/FinalReport.ipynb
@@ -5,7 +5,7 @@
"id": "91ff7871",
"metadata": {},
"source": [
"# Sentiment Analysis on Icelandic corpus using Neural Networks and Machine Learning Classifiers\n",
"# Sentiment Analysis on Icelandic text using Neural Networks and Machine Learning Classifiers\n",
"\n",
"Students\n",
"- Ólafur Aron Jóhannsson, Eysteinn Örn, Birkir Arndal\n",
@@ -27,9 +27,7 @@
"\n",
"## Previous Work\n",
"\n",
"The topic of machine translating English text into a low-resource language and examining what impact it has on sentiment classification has garnered considerable research focus for various low-resource languages [1] [3] [4]. However, Icelandic remains relatively underexplored within this domain [2].\n",
"\n",
"Pang et al. [5] employ supervised learning techniques to categorize movie reviews at the document level as either having a positive or negative sentiment. They utilize a dataset of movie reviews that have already been labeled as positive or negative, which serves as their training data for various established machine learning algorithms. The features considered in their analysis encompass unigrams, bigrams, and additional information like part-of-speech tags. The achieved accuracy in their experiments ranges from 72% to 83%. \n",
"The topic of machine translating English text into a low-resource language and examining what impact it has on sentiment classification has garnered considerable research focus for various low-resource languages [1] [3] [4] [5]. However, Icelandic remains relatively underexplored within this domain [2].\n",
"\n",
"## Introduction\n",
"\n",
@@ -92,6 +90,7 @@
"Pre-processing is the act of transforming raw data to a form that can be used for the next part of the machine learning process. \n",
"\n",
"Here are the preprocessing steps that we used:\n",
"\n",
"- Data cleaning: Removing errors and irrelevant data.\n",
"- Tokenization: Breaks sentences or paragraphs into individual words. \n",
"- Lower casing: Helps normalize and reduce dimensionality of the dataset.\n",
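The preprocessing steps listed above could be sketched roughly as follows; the whitespace tokenizer and the exact character classes are simplifying assumptions for illustration, not the report's actual implementation:

```python
import re

def preprocess(text: str) -> list[str]:
    # Lower casing: normalize and reduce dimensionality.
    lowered = text.lower()
    # Data cleaning: replace anything that is not a (lower-case
    # Icelandic) letter or whitespace with a space.
    cleaned = re.sub(r"[^a-záðéíóúýþæö\s]", " ", lowered)
    # Tokenization: break the sentence into individual words.
    return cleaned.split()

print(preprocess("Frábær mynd!! 10/10"))  # → ['frábær', 'mynd']
```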
@@ -182,7 +181,7 @@
"| NB negative | 25.50 | 56.98 | 35.23 |\n",
"| NB positive | 89.17 | 68.03 | 77.18 |\n",
"\n",
"When we trained the classifier, it gave us a list of coefficients that represent the relationship between the input variables and the output variable in the model. Each coefficient can be interpreted as the relative importance of the word for the class it is associated with, in this case negative or positive. In this chart we can see the top 10 negative and positive values; for a sentence to be positive in this case, it has to have a value of one.\n",
"When we trained the classifier, it gave us a list of coefficients that represent the relationship between the input variables and the output variable in the model. Each coefficient can be interpreted as the relative importance of the word for the class it is associated with, in this case negative or positive. In this chart we can see the top 5 negative and positive values.\n",
"\n",
"| Most important features SVC |\n",
"|------------------------------------------------|\n",
@@ -203,9 +202,9 @@
"\n",
"## Conclusion of Machine Learning Classifiers\n",
"\n",
"These figures suggest that sentiment analysis can carry across Machine Translation when utilizing state-of-the-art machine translation APIs such as Support Vector Classifier or Logistic Regression. The loss in accuracy during translation is minimal, with only a 1.53% and 0.65% drop in accuracy with the IMDB reviews, favoring Google's performance using Support Vector Classifiers.\n",
"These figures suggest that sentiment analysis can carry across machine translation when classifiers such as Support Vector Classifiers or Logistic Regression are used. The loss in accuracy during translation is minimal, with a drop of only ~0.6% for Google and ~1.5% for Miðeind on the IMDB reviews, favoring Google's performance using Support Vector Classifiers.\n",
"\n",
"When using the classifiers on the hand-written reviews from Hannes we noticed that Logistic Regression trained on the dataset from Miðeind Vélþýðing gave the best performance.\n",
"When using the classifiers on the hand-written reviews from Hannes, we noticed that Logistic Regression trained on the dataset from Miðeind Vélþýðing gave the best performance: 45.36% for negative and 89.22% for positive.\n",
"\n",
"This leads us to the conclusion that even though classifiers trained on Google-translated text are best at evaluating their own test dataset, Logistic Regression trained on the Miðeind Vélþýðing text performs best on text hand-written by a native Icelander."
]
@@ -244,44 +243,45 @@
"source": [
"## Neural Network Prediction Results\n",
"\n",
"classification roberta-batch-8-unprocessed-model on IMDB-dataset.csv\n",
"roberta-batch-8-unprocessed-model on IMDB-dataset.csv\n",
"\n",
"| RoBERTa English | Precision | Recall | F1-Score |\n",
"|-----------------------|-----------|--------|----------|\n",
"| negative | 95.75 | 93.90 | 94.81 |\n",
"| positive | 94.99 | 95.89 | 94.99 |\n",
"\n",
"\n",
"classification icebert-google-batch8-remove-noise-model on IMDB-Dataset-GoogleTranslate.csv\n",
"|---------------------|-----------|--------|----------|\n",
"| negative | 95.75 | 93.90 | 94.81 |\n",
"| positive | 94.99 | 95.89 | 94.99 |\n",
"\n",
"| iceBERT Google | Precision | Recall | F1-Score |\n",
"|-----------------------|-----------|--------|----------|\n",
"| negative | 92.31 | 91.34 | 91.83 |\n",
"| positive | 92.18 | 91.19 | 91.68 |\n",
"\n",
"Performance of the IceBERT and Electra models\n",
"\n",
"classification icebert-mideind-batch8-remove-noise-model on IMDB-Dataset-MideindTranslate.csv\n",
"| Model-[*Train Dataset*] (Sentiment) | Precision | Recall | F1-Score |\n",
"|-------------------|---------- |--------|-----------|\n",
"| IceBERT-[*Miðeind*] (negative) | 90.76 | 90.56 | 90.66 |\n",
"| IceBERT-[*Miðeind*] (positive) | 90.72 | 90.92 | 90.82 |\n",
"| IceBERT-[*Google*] (negative) | 92.31 | 91.34 | 91.83 |\n",
"| IceBERT-[*Google*] (positive) | 92.18 | 91.19 | 91.68 |\n",
"| Electra-[*Miðeind*] (negative) | 92.44 | 93.50 | **92.97** |\n",
"| Electra-[*Miðeind*] (positive) | 93.42 | 92.36 | **92.89** |\n",
"| Electra-[*Google*] (negative) | 91.89 | 93.28 | 92.58 |\n",
"| Electra-[*Google*] (positive) | 93.18 | 91.77 | 92.47 |\n",
"\n",
"| iceBERT Miðeind | Precision | Recall | F1-Score |\n",
"|-----------------------|-----------|--------|----------|\n",
"| negative | 90.76 | 90.56 | 90.66 |\n",
"| positive | 90.72 | 90.92 | 90.82 |\n",
"\n",
"\n",
"Loading model from folder ./icebert-google-batch8-remove-noise-model/ using file ../Hannes-Movie-Reviews.csv\n",
"\n",
"| iceBERT Google Hannes Sentiment | Precision | Recall | F1-Score |\n",
"|-----------------------|-----------|--------|----------|\n",
"| negative | 67.41 | 33.51 | 44.77 |\n",
"| positive | 88.35 | 96.88 | 92.42 |\n",
"Transformer models evaluated on the Hannes dataset\n",
"\n",
"| Model-[*Train Dataset*] (Sentiment) | Precision | Recall | F1-Score |\n",
"|-----------------------------------|-----------|--------|-----------|\n",
"| IceBERT-[*Miðeind*] (negative) | 69.23 | 40.22 | 50.88 |\n",
"| IceBERT-[*Miðeind*] (positive) | 89.37 | 96.56 | 92.89 |\n",
"| IceBERT-[*Google*] (negative) | 67.41 | 33.51 | 44.77 |\n",
"| IceBERT-[*Google*] (positive) | 88.35 | 96.88 | 92.42 |\n",
"| ELECTRA-[*Miðeind*] (negative) | 67.44 | 48.60 | **56.49** |\n",
"| ELECTRA-[*Miðeind*] (positive) | 90.63 | 95.49 | **92.99** |\n",
"| ELECTRA-[*Google*] (negative) | 62.85 | 49.16 | 55.17 |\n",
"| ELECTRA-[*Google*] (positive) | 90.62 | 94.42 | 92.48 |\n",
"\n",
"Loading model from folder ./IceBERT-mideind-batch8-remove-noise-model/ using file ../Hannes-Movie-Reviews.csv\n",
"\n",
"| iceBERT Miðeind Hannes Sentiment | Precision | Recall | F1-Score |\n",
"|-----------------------|-----------|--------|----------|\n",
"| negative | 69.23 | 40.22 | 50.88 |\n",
"| positive | 89.37 | 96.56 | 92.89 |\n"
"\n"
]
},
{
Binary file modified src/FinalReport.pdf
106 changes: 106 additions & 0 deletions src/FinalReport2.ipynb
@@ -0,0 +1,106 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\\begin{center}\n",
"\\includegraphics[scale=0.5]{HR_logo_hringur_transparent.png}\n",
"\n",
"\\LARGE{BSc Final Project} \\\\\n",
"\\Large{Department of Computer Science}\n",
"\n",
"\\hfill\n",
"\n",
"{\\bfseries\\Huge Sentiment Analysis on Icelandic text using Neural Networks and Machine Learning Classifiers}\n",
"\n",
"\\hfill\n",
"\n",
"\\textit{Ólafur Aron Jóhannsson} \\\\\n",
"[email protected]\n",
"\n",
"\\textit{Birkir Arndal} \\\\\n",
"[email protected]\n",
"\n",
"\\textit{Eysteinn Örn} \\\\\n",
"[email protected]\n",
"\n",
"\\hfill\n",
"\n",
"\n",
"\\textit{Supervised by} Stefán Ólafsson and Hrafn Loftsson\n",
"\n",
"\\hfill\n",
"\n",
"\n",
"2 November, 2023\n",
"\n",
"\\end{center}\n",
"\n",
"\\hfill\n",
"\n",
"\\hfill"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\\begin{center}\n",
"\n",
"{\\bfseries Abstract}\n",
"\n",
"\\end{center}\n",
"\n",
"In this research paper, we evaluate several machine-learning classifiers and Transformer-based language models for Icelandic sentiment analysis. We machine translated English movie reviews from the IMDb dataset to Icelandic and trained three types of classifiers: Support Vector Machines, Logistic Regression, and Naive Bayes. We also performed downstream training on three pre-trained Transformer-based models, RoBERTa, IceBERT, and Electra, on the original English text and the translated text to evaluate their performance. We found that the Transformer-based models performed better than the machine-learning classifiers on both datasets. The best performing Transformer-based model was Electra trained on the Miðeind-translated text, which achieved an F1-score of ~93% on the test set. The best performing machine-learning classifier was the Support Vector Machine, which achieved an accuracy of ~89% on the test set.\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# **Introduction**\n",
"\n",
"Sentiment Analysis is the process of determining the sentiment of a text. The sentiment of a text can be positive, negative or neutral. Sentiment analysis is a subfield of Natural Language Processing (NLP) and is used in many applications such as social media monitoring, customer service, brand monitoring, and market research. Sentiment analysis is also used in the financial sector to predict stock prices and in politics to predict election results. \n",
"\n",
"\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# **Background and Related Work**"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# **Methods**"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Results and Analysis"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.10.12"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
8 changes: 5 additions & 3 deletions src/Makefile
@@ -1,5 +1,7 @@
pdf:
jupyter nbconvert FinalReport.ipynb --to pdf
pdf2:
jupyter nbconvert FinalReport2.ipynb --to pdf --template=hide_header && open FinalReport2.pdf
pdf3:
pandoc -f markdown-implicit_figures -t pdf FinalReport3.md -o FinalReport3.pdf && open FinalReport3.pdf

tex:
jupyter nbconvert FinalReport.ipynb --to latex
jupyter nbconvert FinalReport2.ipynb --to latex
