\n",
" \n",
" 0 | \n",
- " negative | \n",
- " the entrance was the impressive thing about th... | \n",
- " train | \n",
+ " None | \n",
+ " sans serif html background fff color padding p... | \n",
+ " NaN | \n",
"
\n",
" \n",
" 1 | \n",
" negative | \n",
- " i m a mclover , and i had no problem nwith the... | \n",
- " train | \n",
+ " ordered a large mango pineapple smoothie . sta... | \n",
+ " test | \n",
"
\n",
" \n",
" 2 | \n",
- " negative | \n",
- " less than good here , not terrible , but i see... | \n",
- " train | \n",
+ " positive | \n",
+ " quite a surprise ! n nmy wife and i loved this... | \n",
+ " test | \n",
"
\n",
" \n",
" 3 | \n",
" negative | \n",
- " i don t know if i can ever bring myself to go ... | \n",
- " train | \n",
+ " first i will say , this is a nice atmosphere a... | \n",
+ " test | \n",
"
\n",
" \n",
" 4 | \n",
- " negative | \n",
- " food was ok good but the service was terrible ... | \n",
- " train | \n",
+ " positive | \n",
+ " i was overall pretty impressed by this hotel .... | \n",
+ " test | \n",
"
\n",
" \n",
"\n",
""
],
"text/plain": [
- " rating review split\n",
- "0 negative the entrance was the impressive thing about th... train\n",
- "1 negative i m a mclover , and i had no problem nwith the... train\n",
- "2 negative less than good here , not terrible , but i see... train\n",
- "3 negative i don t know if i can ever bring myself to go ... train\n",
- "4 negative food was ok good but the service was terrible ... train"
+ " rating review split\n",
+ "0 None sans serif html background fff color padding p... NaN\n",
+ "1 negative ordered a large mango pineapple smoothie . sta... test\n",
+ "2 positive quite a surprise ! n nmy wife and i loved this... test\n",
+ "3 negative first i will say , this is a nice atmosphere a... test\n",
+ "4 positive i was overall pretty impressed by this hotel .... test"
]
},
"execution_count": 16,
@@ -508,7 +491,7 @@
"kernelspec": {
"display_name": "nlpbook",
"language": "python",
- "name": "nlpbook"
+ "name": "python3"
},
"language_info": {
"codemirror_mode": {
@@ -520,7 +503,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.7.2"
+ "version": "3.12.7"
},
"toc": {
"colors": {
diff --git a/chapters/chapter_3/3_5_yelp_dataset_preprocessing_LITE.ipynb b/chapters/chapter_3/3_5_yelp_dataset_preprocessing_LITE.ipynb
index e3a9442..0dbb9bc 100644
--- a/chapters/chapter_3/3_5_yelp_dataset_preprocessing_LITE.ipynb
+++ b/chapters/chapter_3/3_5_yelp_dataset_preprocessing_LITE.ipynb
@@ -2,10 +2,11 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
+ "# ToDo: Rerun the code when the missing data is available\n",
"import collections\n",
"import numpy as np\n",
"import pandas as pd\n",
@@ -52,7 +53,7 @@
"by_rating = collections.defaultdict(list)\n",
"for _, row in train_reviews.iterrows():\n",
" by_rating[row.rating].append(row.to_dict())\n",
- " \n",
+ "\n",
"review_subset = []\n",
"\n",
"for _, item_list in sorted(by_rating.items()):\n",
@@ -90,47 +91,17 @@
"