Duplicate Rows
In [1]:
# Identifying and removing duplicate rows.
# Builds a small frame in which rows 3 and 4 repeat rows 0 and 1 exactly,
# shows the boolean duplicate mask, then drops the repeated rows.
import pandas as pd

data = dict(
    name=['Alice', 'Bob', 'Charlie', 'Alice', 'Bob'],
    age=[25, 30, 35, 25, 30],
    city=['New York', 'Los Angeles', 'Chicago', 'New York', 'Los Angeles'],
)
df = pd.DataFrame(data)
print("Original DataFrame:", df, sep="\n")

# One boolean per row: True where the row matches an earlier row in every column.
duplicates = df.duplicated()
print("\nDuplicate Rows:", duplicates, sep="\n")

# Default behaviour keeps the first occurrence; surviving rows keep their index labels.
df_no_duplicates = df.drop_duplicates()
print("\nDataFrame after removing duplicates:", df_no_duplicates, sep="\n")
Original DataFrame:
name age city
0 Alice 25 New York
1 Bob 30 Los Angeles
2 Charlie 35 Chicago
3 Alice 25 New York
4 Bob 30 Los Angeles
Duplicate Rows:
0 False
1 False
2 False
3 True
4 True
dtype: bool
DataFrame after removing duplicates:
name age city
0 Alice 25 New York
1 Bob 30 Los Angeles
2 Charlie 35 Chicago
In [2]:
# Removing duplicates based on specific columns.
# Rows are compared on the 'name' column only; the other columns are ignored
# when deciding whether a row repeats an earlier one.
import pandas as pd

data = {
    'name': ['Alice', 'Bob', 'Charlie', 'Alice', 'Bob'],
    'age': [25, 30, 35, 25, 30],
    'city': ['New York', 'Los Angeles', 'Chicago', 'New York', 'Los Angeles'],
}
df = pd.DataFrame(data)
print("Original DataFrame:", df, sep="\n")

# Restrict the duplicate comparison to the listed column(s).
df_no_duplicates_name = df.drop_duplicates(subset=['name'])
print(
    "\nDataFrame after removing duplicates based on 'name':",
    df_no_duplicates_name,
    sep="\n",
)
Original DataFrame:
name age city
0 Alice 25 New York
1 Bob 30 Los Angeles
2 Charlie 35 Chicago
3 Alice 25 New York
4 Bob 30 Los Angeles
DataFrame after removing duplicates based on 'name':
name age city
0 Alice 25 New York
1 Bob 30 Los Angeles
2 Charlie 35 Chicago
In [3]:
# Keeping the last occurrence of duplicate rows.
# With keep='last' the earlier copies (rows 0 and 1) are dropped and the
# later copies (rows 3 and 4) survive, so the result is indexed 2, 3, 4.
import pandas as pd

data = dict(
    name=['Alice', 'Bob', 'Charlie', 'Alice', 'Bob'],
    age=[25, 30, 35, 25, 30],
    city=['New York', 'Los Angeles', 'Chicago', 'New York', 'Los Angeles'],
)
df = pd.DataFrame(data)
print("Original DataFrame:", df, sep="\n")

df_no_duplicates_last = df.drop_duplicates(keep='last')
print(
    "\nDataFrame after removing duplicates, keeping the last occurrence:",
    df_no_duplicates_last,
    sep="\n",
)
Original DataFrame:
name age city
0 Alice 25 New York
1 Bob 30 Los Angeles
2 Charlie 35 Chicago
3 Alice 25 New York
4 Bob 30 Los Angeles
DataFrame after removing duplicates, keeping the last occurrence:
name age city
2 Charlie 35 Chicago
3 Alice 25 New York
4 Bob 30 Los Angeles
In [4]:
# Counting duplicate rows.
# Summing the boolean duplicate mask counts the rows flagged True, i.e. the
# rows that repeat an earlier row in every column.
import pandas as pd

data = {
    'name': ['Alice', 'Bob', 'Charlie', 'Alice', 'Bob'],
    'age': [25, 30, 35, 25, 30],
    'city': ['New York', 'Los Angeles', 'Chicago', 'New York', 'Los Angeles'],
}
df = pd.DataFrame(data)
print("Original DataFrame:", df, sep="\n")

# True counts as 1 and False as 0 when the mask is summed.
duplicate_count = df.duplicated().sum()
print("\nNumber of duplicate rows:", duplicate_count, sep="\n")
Original DataFrame:
name age city
0 Alice 25 New York
1 Bob 30 Los Angeles
2 Charlie 35 Chicago
3 Alice 25 New York
4 Bob 30 Los Angeles
Number of duplicate rows:
2
