- While searching for ideas for portfolio projects I found Alex Freberg's Data Analyst Portfolio Porject Series, which I decided to follow.
- Cleaning a Nashville Real Estate dataset on SQL.
- Data Cleaning = Microsoft SQL Server.
- SQL Code = Click Here
-
After downloading the dataset from here we import the .xlsx file into SQL Server.
-- 1. Viewing the data
SELECT *
FROM Portfolio_Project..NashvilleHousing
-- 2. Standardising Date Format
-- We want to change the date from DATETIME to DATE
ALTER TABLE Portfolio_Project.dbo.NashvilleHousing
ALTER COLUMN SaleDate DATE
-- 3. Populating Property Address data
-- Check for NULLS
SELECT PropertyAddress
FROM Portfolio_Project..NashvilleHousing
WHERE PropertyAddress IS NULL
/* In the dataset there are various properties that have the same ParcelID but
differ in UniqueIDs, when that happens there are times when only one of the
properties has the PropertyAddress populated */
-- SELF JOIN to check NULLS for properties with same ParcelID and different UniqueID
SELECT
a.ParcelID,
a.PropertyAddress,
b.ParcelID,
b.PropertyAddress
FROM Portfolio_Project..NashvilleHousing AS a
JOIN Portfolio_Project..NashvilleHousing AS b
ON a.ParcelID = b.ParcelID
AND a.UniqueID <> b.UniqueID
WHERE a.PropertyAddress IS NULL
-- Populating a.PropertyAddress with b.PropertyAddress
UPDATE a
SET a.PropertyAddress = b.PropertyAddress
FROM Portfolio_Project..NashvilleHousing AS a
JOIN Portfolio_Project..NashvilleHousing AS b
ON a.ParcelID = b.ParcelID
AND a.UniqueID <> b.UniqueID
WHERE a.PropertyAddress IS NULL
-- 4. Breaking out PropertyAddress into Individual Columns (Address, City)
-- Check to see how the PropertyAddress is separated
-- It is separated with a ','
SELECT PropertyAddress
FROM Portfolio_Project..NashvilleHousing
-- Using SUBSTRING and CHARINDEX
-- To select the Address (everything before the ',') and the City (everything after the ',')
SELECT
SUBSTRING(TRIM(PropertyAddress), 1, CHARINDEX(',', PropertyAddress)-1) AS Address,
SUBSTRING(TRIM(PropertyAddress), CHARINDEX(',', PropertyAddress)+2, LEN(PropertyAddress)) AS City
FROM Portfolio_Project..NashvilleHousing
-- Adding & updating the columns to the table
ALTER TABLE Portfolio_Project..NashvilleHousing
ADD
PropertySplitAddress NVARCHAR(255),
PropertySplitCity NVARCHAR(255)
UPDATE Portfolio_Project..NashvilleHousing
SET
PropertySplitAddress = SUBSTRING(TRIM(PropertyAddress), 1, CHARINDEX(',', PropertyAddress) -1),
PropertySplitCity = SUBSTRING(TRIM(PropertyAddress), CHARINDEX(',', PropertyAddress) +2, LEN(PropertyAddress))
-- 5. Breaking out OwnerAddress into Individual Columns (Address, City, State)
-- Check to see how the OwnerAddress is separated
SELECT OwnerAddress
FROM Portfolio_Project..NashvilleHousing
-- Using PARSENAME
-- This allows us to get different parts of a string, when they are delimited by a '.'
-- So we have to replace the ','
-- It starts backwards, so a 1 will extract the ending of the string
SELECT
PARSENAME(REPLACE(OwnerAddress, ',', '.'), 3) AS OwnerSplitAddress,
PARSENAME(REPLACE(OwnerAddress, ',', '.'), 2) AS OwnerSplitCity,
PARSENAME(REPLACE(OwnerAddress, ',', '.'), 1) AS OwnerSplitState
FROM Portfolio_Project..NashvilleHousing
-- Adding & updating the columns to the table
ALTER TABLE Portfolio_Project..NashvilleHousing
ADD
OwnerSplitAddress NVARCHAR(255),
OwnerSplitCity NVARCHAR(255),
OwnerSplitState NVARCHAR(255)
UPDATE portfolio_project..nashvillehousing
SET
ownersplitaddress = Parsename(Replace(owneraddress, ',', '.'), 3),
ownersplitcity = Parsename(Replace(owneraddress, ',', '.'), 2),
ownersplitstate = Parsename(Replace(owneraddress, ',', '.'), 1)
-- 6. Changing Y and N to Yes and No in SoldAsVacant field
-- Check to see how many options there are and which are the most populated
SELECT
DISTINCT(SoldAsVacant),
COUNT(SoldAsVacant) AS Count
FROM Portfolio_Project..NashvilleHousing
GROUP BY SoldAsVacant
ORDER BY Count
-- Updating column by changing all SoldAsVacant to Yes and No as they are the most populated options
UPDATE portfolio_project..NashvilleHousing
SET soldasvacant = CASE
WHEN soldasvacant LIKE 'Y' THEN 'Yes'
WHEN soldasvacant LIKE 'N' THEN 'No'
ELSE soldasvacant
END
-- 7. Removing Duplicates
-- Using a CTE + ROW_NUMBER to find duplicates
WITH RowNumCTE AS
(
SELECT *,
ROW_NUMBER() OVER(PARTITION BY ParcelID, PropertyAddress, SaleDate, SalePrice, LegalReference ORDER BY ParcelID) AS RowNumber
FROM Portfolio_Project..NashvilleHousing
)
SELECT *
FROM RowNumCTE
WHERE RowNumber > 1
-- there are 104 duplicates
-- Deleting the duplicates from the CTE
WITH RowNumCTE AS
(
SELECT *,
ROW_NUMBER() OVER(PARTITION BY ParcelID, PropertyAddress, SaleDate, SalePrice, LegalReference ORDER BY ParcelID) AS RowNumber
FROM Portfolio_Project..NashvilleHousing
)
DELETE
FROM RowNumCTE
WHERE RowNumber > 1
-- 8. Deleting Unused Columns (Like PropertyAddress & OwnerAddress as we have the splits)
ALTER TABLE Portfolio_Project..NashvilleHousing
DROP COLUMN PropertyAddress, OwnerAddress