PARQUET
In [1]:
#!pip install pandas pyarrow
In [2]:
# Writing a DataFrame to a Parquet File
import pandas as pd
data = {
'Name': ['Alice', 'Bob', 'Charlie'],
'Age': [25, 30, 35],
'City': ['New York', 'Los Angeles', 'Chicago']
}
df = pd.DataFrame(data)
df.to_parquet('sample1.parquet', engine='pyarrow', index=False)
In [3]:
# Reading a Parquet File into a DataFrame
import pandas as pd
df = pd.read_parquet('sample1.parquet', engine='pyarrow')
print(df)
Name Age City 0 Alice 25 New York 1 Bob 30 Los Angeles 2 Charlie 35 Chicago
In [4]:
# Writing a DataFrame to a Partitioned Parquet File
import pandas as pd
data = {
'Name': ['Alice', 'Bob', 'Charlie', 'David'],
'Age': [25, 30, 35, 40],
'City': ['New York', 'Los Angeles', 'Chicago', 'London']
}
df = pd.DataFrame(data)
df.to_parquet('sample2.parquet', engine='pyarrow', partition_cols=['City'], index=False,existing_data_behavior='delete_matching')
In [5]:
# Reading a Partitioned Parquet File
import pandas as pd
df = pd.read_parquet('sample2.parquet', engine='pyarrow')
print(df)
Name Age City 0 Charlie 35 Chicago 1 David 40 London 2 Bob 30 Los Angeles 3 Alice 25 New York
In [6]:
# Reading Multiple Parquet Files into a Single DataFrame
import pandas as pd
import glob
#parquet_files = glob.glob('path/to/parquet/files/*.parquet')
parquet_files = glob.glob('*.parquet')
df = pd.concat([pd.read_parquet(file, engine='pyarrow') for file in parquet_files])
print(df)
Name Age City 0 Alice 25 New York 1 Bob 30 Los Angeles 2 Charlie 35 Chicago 0 Charlie 35 Chicago 1 David 40 London 2 Bob 30 Los Angeles 3 Alice 25 New York
