\n",
" \n",
" 0 | \n",
- " negative | \n",
- " the entrance was the impressive thing about th... | \n",
- " train | \n",
+ " None | \n",
+ " sans serif html background fff color padding p... | \n",
+ " NaN | \n",
"
\n",
" \n",
" 1 | \n",
" negative | \n",
- " i m a mclover , and i had no problem nwith the... | \n",
- " train | \n",
+ " ordered a large mango pineapple smoothie . sta... | \n",
+ " test | \n",
"
\n",
" \n",
" 2 | \n",
- " negative | \n",
- " less than good here , not terrible , but i see... | \n",
- " train | \n",
+ " positive | \n",
+ " quite a surprise ! n nmy wife and i loved this... | \n",
+ " test | \n",
"
\n",
" \n",
" 3 | \n",
" negative | \n",
- " i don t know if i can ever bring myself to go ... | \n",
- " train | \n",
+ " first i will say , this is a nice atmosphere a... | \n",
+ " test | \n",
"
\n",
" \n",
" 4 | \n",
- " negative | \n",
- " food was ok good but the service was terrible ... | \n",
- " train | \n",
+ " positive | \n",
+ " i was overall pretty impressed by this hotel .... | \n",
+ " test | \n",
"
\n",
" \n",
"\n",
""
],
"text/plain": [
- " rating review split\n",
- "0 negative the entrance was the impressive thing about th... train\n",
- "1 negative i m a mclover , and i had no problem nwith the... train\n",
- "2 negative less than good here , not terrible , but i see... train\n",
- "3 negative i don t know if i can ever bring myself to go ... train\n",
- "4 negative food was ok good but the service was terrible ... train"
+ " rating review split\n",
+ "0 None sans serif html background fff color padding p... NaN\n",
+ "1 negative ordered a large mango pineapple smoothie . sta... test\n",
+ "2 positive quite a surprise ! n nmy wife and i loved this... test\n",
+ "3 negative first i will say , this is a nice atmosphere a... test\n",
+ "4 positive i was overall pretty impressed by this hotel .... test"
]
},
"execution_count": 16,
@@ -508,7 +494,7 @@
"kernelspec": {
"display_name": "nlpbook",
"language": "python",
- "name": "nlpbook"
+ "name": "python3"
},
"language_info": {
"codemirror_mode": {
@@ -520,7 +506,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.7.2"
+ "version": "3.12.7"
},
"toc": {
"colors": {
diff --git a/chapters/chapter_3/3_5_yelp_dataset_preprocessing_LITE.ipynb b/chapters/chapter_3/3_5_yelp_dataset_preprocessing_LITE.ipynb
index e3a9442..75761f4 100644
--- a/chapters/chapter_3/3_5_yelp_dataset_preprocessing_LITE.ipynb
+++ b/chapters/chapter_3/3_5_yelp_dataset_preprocessing_LITE.ipynb
@@ -2,32 +2,35 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
+ "# TODO: Re-run this notebook once the missing data is available\n",
"import collections\n",
"import numpy as np\n",
"import pandas as pd\n",
"import re\n",
"\n",
- "from argparse import Namespace"
+ "from argparse import Namespace\n",
+ "# Erratum: data and model storage paths are now read from src.config\n",
+ "from src.config import data_dir, model_dir"
]
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"args = Namespace(\n",
- " raw_train_dataset_csv=\"data/yelp/raw_train.csv\",\n",
- " raw_test_dataset_csv=\"data/yelp/raw_test.csv\",\n",
+ " raw_train_dataset_csv=f\"{data_dir}/yelp/raw_train.csv\",\n",
+ " raw_test_dataset_csv=f\"{data_dir}/yelp/raw_test.csv\",\n",
" proportion_subset_of_train=0.1,\n",
" train_proportion=0.7,\n",
" val_proportion=0.15,\n",
" test_proportion=0.15,\n",
- " output_munged_csv=\"data/yelp/reviews_with_splits_lite.csv\",\n",
+ " output_munged_csv=f\"{data_dir}/yelp/reviews_with_splits_lite.csv\",\n",
" seed=1337\n",
")"
]
@@ -52,7 +55,7 @@
"by_rating = collections.defaultdict(list)\n",
"for _, row in train_reviews.iterrows():\n",
" by_rating[row.rating].append(row.to_dict())\n",
- " \n",
+ "\n",
"review_subset = []\n",
"\n",
"for _, item_list in sorted(by_rating.items()):\n",
@@ -90,47 +93,17 @@
"