diff options
author | 2024-02-15 16:59:06 -0800 | |
---|---|---|
committer | 2024-02-15 16:59:06 -0800 | |
commit | aa9c60b320a3a2e5679826b471f44934ef64fdae (patch) | |
tree | 2a23631b8ef506eabf271b5ae2d225ae7d3681e1 /CS105MiniProject.ipynb | |
parent | 5f6ec0476f262f14c4b134d44f294aeb8d603743 (diff) | |
download | CS105MiniProject-aa9c60b320a3a2e5679826b471f44934ef64fdae.tar.gz CS105MiniProject-aa9c60b320a3a2e5679826b471f44934ef64fdae.tar.zst CS105MiniProject-aa9c60b320a3a2e5679826b471f44934ef64fdae.zip |
Adds some preprocessing
Diffstat (limited to 'CS105MiniProject.ipynb')
-rw-r--r-- | CS105MiniProject.ipynb | 43 |
1 files changed, 34 insertions, 9 deletions
diff --git a/CS105MiniProject.ipynb b/CS105MiniProject.ipynb index d19a096..dda76e0 100644 --- a/CS105MiniProject.ipynb +++ b/CS105MiniProject.ipynb @@ -12,15 +12,15 @@ "height": 614 }, "ExecuteTime": { - "end_time": "2024-02-16T00:42:12.437369Z", - "start_time": "2024-02-16T00:42:11.755307Z" + "end_time": "2024-02-16T00:58:32.054806Z", + "start_time": "2024-02-16T00:58:31.371211Z" } }, "outputs": [ { "data": { - "text/plain": " Timestamp What gender do you identify as? \\\n0 2/9/2024 20:12:14 Male \n1 2/9/2024 20:16:34 Female \n2 2/9/2024 20:18:55 Female \n3 2/9/2024 20:24:00 Male \n4 2/9/2024 20:26:16 Male \n.. ... ... \n255 2/14/2024 19:46:28 Male \n256 2/15/2024 0:28:38 Male \n257 2/15/2024 8:33:45 Male \n258 2/15/2024 16:10:40 Female \n259 2/15/2024 16:14:11 Female \n\n Who do you live with? \\\n0 Neither \n1 Both \n2 Friends \n3 Neither \n4 Neither \n.. ... \n255 Friends \n256 Family \n257 Family \n258 Family \n259 Friends \n\n Do you currently live in a house, apartnment, or dorm? \\\n0 House \n1 Apartment \n2 House \n3 Apartment \n4 Apartment \n.. ... \n255 House \n256 Apartment \n257 House \n258 Apartment \n259 Dorm \n\n How many people live in your household? Do you currently work? \\\n0 6 Yes \n1 4 No \n2 4 No \n3 1 No \n4 1 Yes \n.. ... ... \n255 5 Yes \n256 North District 4 bed 2 bath No \n257 9 No \n258 4 Yes \n259 3 (room), 8 (hall), ~70 (building) No \n\n How many hours do you work per week on average? \\\n0 5 - 10 \n1 NaN \n2 NaN \n3 NaN \n4 10 - 20 \n.. ... \n255 10 - 20 \n256 NaN \n257 1 - 5 \n258 5 - 10 \n259 NaN \n\n Do you work on or off campus? \\\n0 Off-campus \n1 NaN \n2 NaN \n3 NaN \n4 Off-campus \n.. ... \n255 On-campus \n256 NaN \n257 Off-campus \n258 On-campus \n259 NaN \n\n Do you work in a department related to your major? \\\n0 No \n1 NaN \n2 NaN \n3 No \n4 Yes \n.. ... \n255 No \n256 NaN \n257 No \n258 No \n259 NaN \n\n Do you have roommates that are part of your major? \n0 No \n1 Yes \n2 No \n3 No \n4 No \n.. ... \n255 No \n256 No \n257 No \n258 No \n259 Yes \n\n[260 rows x 10 columns]", - "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Timestamp</th>\n <th>What gender do you identify as?</th>\n <th>Who do you live with?</th>\n <th>Do you currently live in a house, apartnment, or dorm?</th>\n <th>How many people live in your household?</th>\n <th>Do you currently work?</th>\n <th>How many hours do you work per week on average?</th>\n <th>Do you work on or off campus?</th>\n <th>Do you work in a department related to your major?</th>\n <th>Do you have roommates that are part of your major?</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>2/9/2024 20:12:14</td>\n <td>Male</td>\n <td>Neither</td>\n <td>House</td>\n <td>6</td>\n <td>Yes</td>\n <td>5 - 10</td>\n <td>Off-campus</td>\n <td>No</td>\n <td>No</td>\n </tr>\n <tr>\n <th>1</th>\n <td>2/9/2024 20:16:34</td>\n <td>Female</td>\n <td>Both</td>\n <td>Apartment</td>\n <td>4</td>\n <td>No</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Yes</td>\n </tr>\n <tr>\n <th>2</th>\n <td>2/9/2024 20:18:55</td>\n <td>Female</td>\n <td>Friends</td>\n <td>House</td>\n <td>4</td>\n <td>No</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>No</td>\n </tr>\n <tr>\n <th>3</th>\n <td>2/9/2024 20:24:00</td>\n <td>Male</td>\n <td>Neither</td>\n <td>Apartment</td>\n <td>1</td>\n <td>No</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>No</td>\n <td>No</td>\n </tr>\n <tr>\n <th>4</th>\n <td>2/9/2024 20:26:16</td>\n <td>Male</td>\n <td>Neither</td>\n <td>Apartment</td>\n <td>1</td>\n <td>Yes</td>\n <td>10 - 20</td>\n <td>Off-campus</td>\n <td>Yes</td>\n <td>No</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>255</th>\n <td>2/14/2024 19:46:28</td>\n <td>Male</td>\n <td>Friends</td>\n <td>House</td>\n <td>5</td>\n <td>Yes</td>\n <td>10 - 20</td>\n <td>On-campus</td>\n <td>No</td>\n <td>No</td>\n </tr>\n <tr>\n <th>256</th>\n <td>2/15/2024 0:28:38</td>\n <td>Male</td>\n <td>Family</td>\n <td>Apartment</td>\n <td>North District 4 bed 2 bath</td>\n <td>No</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>No</td>\n </tr>\n <tr>\n <th>257</th>\n <td>2/15/2024 8:33:45</td>\n <td>Male</td>\n <td>Family</td>\n <td>House</td>\n <td>9</td>\n <td>No</td>\n <td>1 - 5</td>\n <td>Off-campus</td>\n <td>No</td>\n <td>No</td>\n </tr>\n <tr>\n <th>258</th>\n <td>2/15/2024 16:10:40</td>\n <td>Female</td>\n <td>Family</td>\n <td>Apartment</td>\n <td>4</td>\n <td>Yes</td>\n <td>5 - 10</td>\n <td>On-campus</td>\n <td>No</td>\n <td>No</td>\n </tr>\n <tr>\n <th>259</th>\n <td>2/15/2024 16:14:11</td>\n <td>Female</td>\n <td>Friends</td>\n <td>Dorm</td>\n <td>3 (room), 8 (hall), ~70 (building)</td>\n <td>No</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Yes</td>\n </tr>\n </tbody>\n</table>\n<p>260 rows × 10 columns</p>\n</div>" + "text/plain": " Timestamp What is your current class standing? \\\n0 2/9/2024 20:12:14 Senior \n1 2/9/2024 20:16:34 Junior \n2 2/9/2024 20:18:55 Junior \n3 2/9/2024 20:24:00 Senior \n4 2/9/2024 20:26:16 Graduate \n.. ... ... \n255 2/14/2024 19:46:28 Junior \n256 2/15/2024 0:28:38 NaN \n257 2/15/2024 8:33:45 Senior \n258 2/15/2024 16:10:40 Sophomore \n259 2/15/2024 16:14:11 Sophomore \n\n What gender do you identify as? Who do you live with? \\\n0 Male Neither \n1 Female Both \n2 Female Friends \n3 Male Neither \n4 Male Neither \n.. ... ... \n255 Male Friends \n256 Male Family \n257 Male Family \n258 Female Family \n259 Female Friends \n\n Do you currently live in a house, apartnment, or dorm? \\\n0 House \n1 Apartment \n2 House \n3 Apartment \n4 Apartment \n.. ... \n255 House \n256 Apartment \n257 House \n258 Apartment \n259 Dorm \n\n How many people live in your household? Do you currently work? \\\n0 6 Yes \n1 4 No \n2 4 No \n3 1 No \n4 1 Yes \n.. ... ... \n255 5 Yes \n256 North District 4 bed 2 bath No \n257 9 No \n258 4 Yes \n259 3 (room), 8 (hall), ~70 (building) No \n\n How many hours do you work per week on average? \\\n0 5 - 10 \n1 NaN \n2 NaN \n3 NaN \n4 10 - 20 \n.. ... \n255 10 - 20 \n256 NaN \n257 1 - 5 \n258 5 - 10 \n259 NaN \n\n Do you work on or off campus? \\\n0 Off-campus \n1 NaN \n2 NaN \n3 NaN \n4 Off-campus \n.. ... \n255 On-campus \n256 NaN \n257 Off-campus \n258 On-campus \n259 NaN \n\n Do you work in a department related to your major? \\\n0 No \n1 NaN \n2 NaN \n3 No \n4 Yes \n.. ... \n255 No \n256 NaN \n257 No \n258 No \n259 NaN \n\n Do you have roommates that are part of your major? \n0 No \n1 Yes \n2 No \n3 No \n4 No \n.. ... \n255 No \n256 No \n257 No \n258 No \n259 Yes \n\n[260 rows x 11 columns]", + "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Timestamp</th>\n <th>What is your current class standing?</th>\n <th>What gender do you identify as?</th>\n <th>Who do you live with?</th>\n <th>Do you currently live in a house, apartnment, or dorm?</th>\n <th>How many people live in your household?</th>\n <th>Do you currently work?</th>\n <th>How many hours do you work per week on average?</th>\n <th>Do you work on or off campus?</th>\n <th>Do you work in a department related to your major?</th>\n <th>Do you have roommates that are part of your major?</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>2/9/2024 20:12:14</td>\n <td>Senior</td>\n <td>Male</td>\n <td>Neither</td>\n <td>House</td>\n <td>6</td>\n <td>Yes</td>\n <td>5 - 10</td>\n <td>Off-campus</td>\n <td>No</td>\n <td>No</td>\n </tr>\n <tr>\n <th>1</th>\n <td>2/9/2024 20:16:34</td>\n <td>Junior</td>\n <td>Female</td>\n <td>Both</td>\n <td>Apartment</td>\n <td>4</td>\n <td>No</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Yes</td>\n </tr>\n <tr>\n <th>2</th>\n <td>2/9/2024 20:18:55</td>\n <td>Junior</td>\n <td>Female</td>\n <td>Friends</td>\n <td>House</td>\n <td>4</td>\n <td>No</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>No</td>\n </tr>\n <tr>\n <th>3</th>\n <td>2/9/2024 20:24:00</td>\n <td>Senior</td>\n <td>Male</td>\n <td>Neither</td>\n <td>Apartment</td>\n <td>1</td>\n <td>No</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>No</td>\n <td>No</td>\n </tr>\n <tr>\n <th>4</th>\n <td>2/9/2024 20:26:16</td>\n <td>Graduate</td>\n <td>Male</td>\n <td>Neither</td>\n <td>Apartment</td>\n <td>1</td>\n <td>Yes</td>\n <td>10 - 20</td>\n <td>Off-campus</td>\n <td>Yes</td>\n <td>No</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>255</th>\n <td>2/14/2024 19:46:28</td>\n <td>Junior</td>\n <td>Male</td>\n <td>Friends</td>\n <td>House</td>\n <td>5</td>\n <td>Yes</td>\n <td>10 - 20</td>\n <td>On-campus</td>\n <td>No</td>\n <td>No</td>\n </tr>\n <tr>\n <th>256</th>\n <td>2/15/2024 0:28:38</td>\n <td>NaN</td>\n <td>Male</td>\n <td>Family</td>\n <td>Apartment</td>\n <td>North District 4 bed 2 bath</td>\n <td>No</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>No</td>\n </tr>\n <tr>\n <th>257</th>\n <td>2/15/2024 8:33:45</td>\n <td>Senior</td>\n <td>Male</td>\n <td>Family</td>\n <td>House</td>\n <td>9</td>\n <td>No</td>\n <td>1 - 5</td>\n <td>Off-campus</td>\n <td>No</td>\n <td>No</td>\n </tr>\n <tr>\n <th>258</th>\n <td>2/15/2024 16:10:40</td>\n <td>Sophomore</td>\n <td>Female</td>\n <td>Family</td>\n <td>Apartment</td>\n <td>4</td>\n <td>Yes</td>\n <td>5 - 10</td>\n <td>On-campus</td>\n <td>No</td>\n <td>No</td>\n </tr>\n <tr>\n <th>259</th>\n <td>2/15/2024 16:14:11</td>\n <td>Sophomore</td>\n <td>Female</td>\n <td>Friends</td>\n <td>Dorm</td>\n <td>3 (room), 8 (hall), ~70 (building)</td>\n <td>No</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>Yes</td>\n </tr>\n </tbody>\n</table>\n<p>260 rows × 11 columns</p>\n</div>" }, "execution_count": 1, "metadata": {}, @@ -33,23 +33,48 @@ "import numpy as np\n", "\n", "df = pd.read_csv(\"data.csv\")\n", - "df = df.iloc[:, [0, 5, 7, 8, 9, 58, 59, 60, 61, 26]]\n", + "df = df.iloc[:, [0, 2, 5, 7, 8, 9, 58, 59, 60, 61, 26]]\n", "df" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "29889175", "metadata": { "id": "29889175", "ExecuteTime": { - "end_time": "2024-02-16T00:42:12.440256Z", - "start_time": "2024-02-16T00:42:12.438587Z" + "end_time": "2024-02-16T00:58:32.059881Z", + "start_time": "2024-02-16T00:58:32.056070Z" } }, "outputs": [], - "source": [] + "source": [ + "df['How many people live in your household?'] = (df['How many people live in your household?']\n", + " .fillna(0)\n", + " .replace('4 in total', '4')\n", + " .replace('4 (Including me)', '4')\n", + " .replace('at school 4 including me ', '4')\n", + " .replace('3 excluding me', '3')\n", + " .replace('5 including me', '5')\n", + " .replace('North District 4 bed 2 bath', '4')\n", + " .replace('3 (room), 8 (hall), ~70 (building)', '3')\n", + " .astype(int))" + ] + }, + { + "cell_type": "code", + "outputs": [], + "source": [], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-02-16T00:58:32.062613Z", + "start_time": "2024-02-16T00:58:32.061050Z" + } + }, + "id": "de4448fd64205d85", + "execution_count": 2 } ], "metadata": { |