Skip to content

EnyaCrocock/Nashville_Housing_Data_Cleaning

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

8 Commits
 
 
 
 

Repository files navigation

SQL Data Cleaning Project

💡 Inspiration


✍🏼 The Objective

  • Cleaning a Nashville Real Estate dataset on SQL.

📈 The Dataset

💻 Tools Used

  • Data Cleaning = Microsoft SQL Server.

🔗 Links


SQL Data Cleaning

Downloading and Importing Data

  • After downloading the dataset from here we import the .xlsx file into SQL Server.

    image

SQL Server Queries

-- 1. Viewing the data 
  
SELECT *
FROM Portfolio_Project..NashvilleHousing

image


-- 2. Standardising Date Format
--    We want to change the date from DATETIME to DATE

ALTER TABLE Portfolio_Project.dbo.NashvilleHousing
ALTER COLUMN SaleDate DATE

image image


-- 3. Populating Property Address data

-- Check for NULLS

SELECT PropertyAddress
FROM Portfolio_Project..NashvilleHousing
WHERE PropertyAddress IS NULL

/* In the dataset there are various properties that have the same ParcelID but 
 differ in UniqueIDs, when that happens there are times when only one of the 
 properties has the PropertyAddress populated */

-- SELF JOIN to check NULLS for properties with same ParcelID and different UniqueID

SELECT 
  a.ParcelID,
  a.PropertyAddress,
  b.ParcelID,
  b.PropertyAddress
FROM Portfolio_Project..NashvilleHousing AS a
  JOIN Portfolio_Project..NashvilleHousing AS b
    ON a.ParcelID = b.ParcelID
    AND a.UniqueID <> b.UniqueID 
WHERE a.PropertyAddress IS NULL

image

-- Populating a.PropertyAddress with b.PropertyAddress

UPDATE a
SET a.PropertyAddress = b.PropertyAddress
FROM Portfolio_Project..NashvilleHousing AS a
  JOIN Portfolio_Project..NashvilleHousing AS b
    ON a.ParcelID = b.ParcelID
    AND a.UniqueID  <> b.UniqueID 
WHERE a.PropertyAddress IS NULL

-- 4. Breaking out PropertyAddress into Individual Columns (Address, City)

-- Check to see how the PropertyAddress is separated 
-- It is separated with a ','

SELECT PropertyAddress
FROM Portfolio_Project..NashvilleHousing

-- Using SUBSTRING and CHARINDEX 
-- To select the Address (everything before the ',') and the City (everything after the ',') 

SELECT 
  SUBSTRING(TRIM(PropertyAddress), 1, CHARINDEX(',', PropertyAddress)-1) AS Address,
  SUBSTRING(TRIM(PropertyAddress), CHARINDEX(',', PropertyAddress)+2, LEN(PropertyAddress)) AS City
FROM Portfolio_Project..NashvilleHousing

image

-- Adding & updating the columns to the table

ALTER TABLE Portfolio_Project..NashvilleHousing
ADD 
  PropertySplitAddress NVARCHAR(255),
  PropertySplitCity NVARCHAR(255)

UPDATE Portfolio_Project..NashvilleHousing	
SET 
  PropertySplitAddress = SUBSTRING(TRIM(PropertyAddress), 1, CHARINDEX(',', PropertyAddress) -1),
  PropertySplitCity = SUBSTRING(TRIM(PropertyAddress), CHARINDEX(',', PropertyAddress) +2, LEN(PropertyAddress))

image


-- 5. Breaking out OwnerAddress into Individual Columns (Address, City, State)

-- Check to see how the OwnerAddress is separated 

SELECT OwnerAddress
FROM Portfolio_Project..NashvilleHousing

-- Using PARSENAME
-- This allows us to get different parts of a string, when they are delimited by a '.'
-- So we have to replace the ','
-- It starts backwards, so a 1 will extract the ending of the string 

SELECT 
  PARSENAME(REPLACE(OwnerAddress, ',', '.'), 3) AS OwnerSplitAddress,
  PARSENAME(REPLACE(OwnerAddress, ',', '.'), 2) AS OwnerSplitCity,
  PARSENAME(REPLACE(OwnerAddress, ',', '.'), 1) AS OwnerSplitState
FROM Portfolio_Project..NashvilleHousing

-- Adding & updating the columns to the table

ALTER TABLE Portfolio_Project..NashvilleHousing
ADD 
  OwnerSplitAddress NVARCHAR(255),
  OwnerSplitCity NVARCHAR(255),
  OwnerSplitState NVARCHAR(255)

UPDATE portfolio_project..nashvillehousing
SET
  ownersplitaddress = Parsename(Replace(owneraddress, ',', '.'), 3),
  ownersplitcity = Parsename(Replace(owneraddress, ',', '.'), 2),
  ownersplitstate = Parsename(Replace(owneraddress, ',', '.'), 1)  

image


-- 6. Changing Y and N to Yes and No in SoldAsVacant field

-- Check to see how many options there are and which are the most populated

SELECT 
  DISTINCT(SoldAsVacant),
  COUNT(SoldAsVacant) AS Count
FROM Portfolio_Project..NashvilleHousing
GROUP BY SoldAsVacant
ORDER BY Count

image

-- Updating column by changing all SoldAsVacant to Yes and No as they are the most populated options

UPDATE portfolio_project..NashvilleHousing
SET    soldasvacant = CASE
                        WHEN soldasvacant LIKE 'Y' THEN 'Yes'
                        WHEN soldasvacant LIKE 'N' THEN 'No'
                        ELSE soldasvacant
                      END  

image


-- 7. Removing Duplicates

-- Using a CTE + ROW_NUMBER to find duplicates

WITH RowNumCTE AS
(
  SELECT *,
    ROW_NUMBER() OVER(PARTITION BY ParcelID, PropertyAddress, SaleDate, SalePrice, LegalReference ORDER BY ParcelID) AS RowNumber
  FROM Portfolio_Project..NashvilleHousing
)
SELECT *
FROM RowNumCTE
WHERE RowNumber > 1

-- there are 104 duplicates

image

-- Deleting the duplicates from the CTE

WITH RowNumCTE AS
(
  SELECT *,
    ROW_NUMBER() OVER(PARTITION BY ParcelID, PropertyAddress, SaleDate, SalePrice, LegalReference ORDER BY ParcelID) AS RowNumber
  FROM Portfolio_Project..NashvilleHousing
)
DELETE
FROM RowNumCTE
WHERE RowNumber > 1

-- 8. Deleting Unused Columns (Like PropertyAddress & OwnerAddress as we have the splits)

ALTER TABLE Portfolio_Project..NashvilleHousing
DROP COLUMN PropertyAddress, OwnerAddress

About

Cleaning a Nashville Housing Dataset on SQL.

Topics

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published