I have a dataset like this:
data = {'Price': [1, 4, 5, 100],
'Year': [20, 21, 19, 18],
'Mileage': [100, 1500, 1654, 2024],
'EngineV': [2, 3, 5, 4]}
I'm trying to remove outliers with this function:
def remove_outliers(data=data,columns=columns,n_std=3):
for col in columns:
mean = data[col].mean()
sd = data[col].std()
df = data[(data[col] <= mean+(n_std*sd))]
return data
df = pd.DataFrame(data.apply(remove_outliers))
...but I'm getting this error:
KeyError: 'Price'
I've tried specifying the data with these, but none resolve the error:
columns=data[['Price','Mileage','EngineV','Year']]
columns=("Price","Mileage","EngineV","Year")
columns=data.iloc[:,[0,1,2,3]]
Here's the full traceback:
KeyError Traceback (most recent call last)
File C:\Python3\lib\site-packages\pandas\core\indexes\base.py:3621, in Index.get_loc(self, key, method, tolerance)
3620 try:
-> 3621 return self._engine.get_loc(casted_key)
3622 except KeyError as err:
File C:\Python3\lib\site-packages\pandas\_libs\index.pyx:136, in pandas._libs.index.IndexEngine.get_loc()
File C:\Python3\lib\site-packages\pandas\_libs\index.pyx:144, in pandas._libs.index.IndexEngine.get_loc()
File pandas\_libs\index_class_helper.pxi:41, in pandas._libs.index.Int64Engine._check_type()
KeyError: 'Price'
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
Input In [13], in <module>
8 df = data[(data[col] <= mean+(n_std*sd))]
10 return data
---> 11 df = pd.DataFrame(data.apply(remove_outliers))
12 print("New Shape :", df.shape)
File C:\Python3\lib\site-packages\pandas\core\frame.py:8839, in DataFrame.apply(self, func, axis, raw, result_type, args, **kwargs)
8828 from pandas.core.apply import frame_apply
8830 op = frame_apply(
8831 self,
8832 func=func,
(...)
8837 kwargs=kwargs,
8838 )
-> 8839 return op.apply().__finalize__(self, method="apply")
File C:\Python3\lib\site-packages\pandas\core\apply.py:727, in FrameApply.apply(self)
724 elif self.raw:
725 return self.apply_raw()
--> 727 return self.apply_standard()
File C:\Python3\lib\site-packages\pandas\core\apply.py:851, in FrameApply.apply_standard(self)
850 def apply_standard(self):
--> 851 results, res_index = self.apply_series_generator()
853 # wrap results
854 return self.wrap_results(results, res_index)
File C:\Python3\lib\site-packages\pandas\core\apply.py:867, in FrameApply.apply_series_generator(self)
864 with option_context("mode.chained_assignment", None):
865 for i, v in enumerate(series_gen):
866 # ignore SettingWithCopy here in case the user mutates
--> 867 results[i] = self.f(v)
868 if isinstance(results[i], ABCSeries):
869 # If we have a view on v, we need to make a copy because
870 # series_generator will swap out the underlying data
871 results[i] = results[i].copy(deep=False)
Input In [13], in remove_outliers(data, columns, n_std)
3 def remove_outliers(data=data,columns=columns,n_std=3):
4 for col in columns:
----> 6 mean = data[col].mean()
7 sd = data[col].std()
8 df = data[(data[col] <= mean+(n_std*sd))]
File C:\Python3\lib\site-packages\pandas\core\series.py:958, in Series.__getitem__(self, key)
955 return self._values[key]
957 elif key_is_scalar:
--> 958 return self._get_value(key)
960 if is_hashable(key):
961 # Otherwise index.get_value will raise InvalidIndexError
962 try:
963 # For labels that don't resolve as scalars like tuples and frozensets
File C:\Python3\lib\site-packages\pandas\core\series.py:1069, in Series._get_value(self, label, takeable)
1066 return self._values[label]
1068 # Similar to Index.get_value, but we do not fall back to positional
-> 1069 loc = self.index.get_loc(label)
1070 return self.index._get_values_for_loc(self, loc, label)
File C:\Python3\lib\site-packages\pandas\core\indexes\base.py:3623, in Index.get_loc(self, key, method, tolerance)
3621 return self._engine.get_loc(casted_key)
3622 except KeyError as err:
-> 3623 raise KeyError(key) from err
3624 except TypeError:
3625 # If we have a listlike key, _check_indexing_error will raise
3626 # InvalidIndexError. Otherwise we fall through and re-raise
3627 # the TypeError.
3628 self._check_indexing_error(key)
KeyError: 'Price'
My ultimate goal is to remove the rows with outliers from the whole dataset. Any help would be much appreciated!
datais adictwhere the key is the column name, there shouldn't be a need for setting them separately. And post the full traceback so that we see the failing line.