Creating DataFrames
In [ ]:
import pandas as pd
In [ ]:
# Build a DataFrame by hand from a column-oriented dict:
# each key becomes a column, each list its values.
people = {
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [25, 30, 35],
}
df = pd.DataFrame(people)
print(df)
In [ ]:
# A small synthetic patient-record table, defined column by column.
records = {
    'PatientID': [1, 2, 3],
    'Name': ['John Doe', 'Jane Smith', 'Emily Davis'],
    'Age': [45, 38, 50],
    'Diagnosis': ['Hypertension', 'Diabetes', 'Asthma'],
}
df_manual = pd.DataFrame(records)
print(df_manual)
CSV¶
In [ ]:
# Parse a plain CSV file into a DataFrame and preview the first rows.
csv_path = 'healthcare_data.csv'
df_csv = pd.read_csv(csv_path)
print(df_csv.head())
Parquet¶
In [ ]:
# Parquet is a binary, columnar format; pandas reads it natively.
parquet_path = 'healthcare_data.parquet'
df_parquet = pd.read_parquet(parquet_path)
print(df_parquet.head())
XLS¶
In [ ]:
# Legacy Excel workbook (.xls); read_excel picks the engine from the extension.
xls_path = 'healthcare_data.xls'
df_xls = pd.read_excel(xls_path)
print(df_xls.head())
XLSX¶
In [ ]:
# Modern Excel workbook (.xlsx); same read_excel entry point as .xls.
xlsx_path = 'healthcare_data.xlsx'
df_xlsx = pd.read_excel(xlsx_path)
print(df_xlsx.head())
GZ¶
In [ ]:
import pandas as pd
import gzip

# Decompress the gzip archive on the fly ('rt' = read text) and
# feed the resulting text stream straight to the CSV parser.
with gzip.open('healthcare_data.csv.gz', mode='rt') as stream:
    df_gz = pd.read_csv(stream)
print(df_gz.head())
ZIP¶
In [ ]:
# Note: the `zipfile` module is part of the Python standard library — no installation (e.g. the `zipfile36` backport) is needed.
In [ ]:
import pandas as pd
import zipfile

# Open the archive, then read one named CSV member directly from it
# without extracting anything to disk.
with zipfile.ZipFile('healthcare_data.zip') as archive:
    with archive.open('healthcare_data.csv') as member:
        df_zip = pd.read_csv(member)
print(df_zip.head())
In [ ]:
import pandas as pd
import zipfile

# Read every CSV member of a ZIP archive and stack them into one DataFrame.

# Path to your ZIP file
zip_file_path = 'path/to/your/file.zip'

df_list = []
with zipfile.ZipFile(zip_file_path, 'r') as z:
    for file_name in z.namelist():
        # Skip non-CSV members (directories, readmes, metadata files, ...).
        if not file_name.endswith('.csv'):
            continue
        with z.open(file_name) as f:
            df_list.append(pd.read_csv(f))

# pd.concat raises a generic "No objects to concatenate" on an empty list;
# fail with a message that names the archive instead.
if not df_list:
    raise ValueError(f'No CSV files found in {zip_file_path!r}')

# ignore_index=True renumbers rows 0..n-1 across the concatenated frames.
combined_df = pd.concat(df_list, ignore_index=True)
print(combined_df.head())
TAR¶
In [ ]:
import pandas as pd
import tarfile

# Stream each regular file out of the gzipped tarball ('r:gz'),
# parse it as CSV, and preview it — no extraction to disk.
with tarfile.open('healthcare_data.tar.gz', 'r:gz') as tar:
    for member in tar.getmembers():
        if not member.isfile():
            continue
        handle = tar.extractfile(member)
        df_tar = pd.read_csv(handle)
        print(df_tar.head())
In [ ]:
import pandas as pd
import tarfile

# Read every CSV member of a TAR archive and stack them into one DataFrame.

# Path to your TAR file
tar_file_path = 'path/to/your/file.tar'

df_list = []
with tarfile.open(tar_file_path, 'r') as t:
    for file_name in t.getnames():
        # Only CSV members are of interest.
        if not file_name.endswith('.csv'):
            continue
        f = t.extractfile(file_name)
        # extractfile() returns None for non-regular members (dirs, links);
        # the original code would have crashed pd.read_csv on those.
        if f is None:
            continue
        with f:
            df_list.append(pd.read_csv(f))

# Fail with a clear message rather than pd.concat's generic empty-list error.
if not df_list:
    raise ValueError(f'No CSV files found in {tar_file_path!r}')

combined_df = pd.concat(df_list, ignore_index=True)
print(combined_df.head())
Reading multiple CSV files in a directory¶
In [ ]:
import glob
import os

import pandas as pd

# Combine every CSV file in a directory into a single DataFrame.

# Directory containing the CSV files.
path = 'path/to/your/csv/files/'

# os.path.join is safer than string concatenation (handles missing
# trailing separators); sorted() makes the row order reproducible
# across platforms, since glob's order is filesystem-dependent.
all_files = sorted(glob.glob(os.path.join(path, '*.csv')))

# Read each file into its own DataFrame.
dfs = [pd.read_csv(file) for file in all_files]

# pd.concat raises a generic error on an empty list; name the directory instead.
if not dfs:
    raise ValueError(f'No CSV files found under {path!r}')

# ignore_index=True renumbers rows 0..n-1 across all input files.
combined_df = pd.concat(dfs, ignore_index=True)
print(combined_df)
Google Cloud Storage¶
In [ ]:
!pip install google-cloud-storage
In [ ]:
import io

import pandas as pd
from google.cloud import storage

# Download one CSV object from Google Cloud Storage into a DataFrame.

# Replace with your actual credentials and bucket/file details
credentials_json = {
    "type": "service_account",
    "project_id": "your-project-id",
    "private_key_id": "your-private-key-id",
    "private_key": "your-private-key",
    "client_email": "your-client-email",
    "client_id": "your-client-id",
    "auth_uri": "https://accounts.google.com/o/oauth2/auth",
    "token_uri": "https://oauth2.googleapis.com/token",
    "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
    "client_x509_cert_url": "your-client-x509-cert-url"
}

# Authenticate straight from the in-memory service-account key.
client = storage.Client.from_service_account_info(credentials_json)
bucket = client.get_bucket('your-bucket-name')
blob = bucket.blob('your-file.csv')

# Download the object's bytes and decode them to text.
data = blob.download_as_string().decode('utf-8')

# BUG FIX: pd.compat.StringIO was removed in pandas 1.0 — wrap the text
# in io.StringIO so read_csv gets a file-like object.
df = pd.read_csv(io.StringIO(data))
print(df.head())
In [ ]:
import io

from google.cloud import storage
import pandas as pd

# Download every CSV under a GCS prefix and stack them into one DataFrame.

# Replace with your actual credentials and bucket details
credentials_json = {
    "type": "service_account",
    "project_id": "your-project-id",
    "private_key_id": "your-private-key-id",
    "private_key": "your-private-key",
    "client_email": "your-client-email",
    "client_id": "your-client-id",
    "auth_uri": "https://accounts.google.com/o/oauth2/auth",
    "token_uri": "https://oauth2.googleapis.com/token",
    "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
    "client_x509_cert_url": "your-client-x509-cert-url"
}

client = storage.Client.from_service_account_info(credentials_json)
bucket = client.get_bucket('your-bucket-name')

# List all objects under the folder prefix.
blobs = bucket.list_blobs(prefix='your-folder/')

# Read each CSV object into its own DataFrame.
dfs = []
for blob in blobs:
    if not blob.name.endswith('.csv'):
        continue
    data = blob.download_as_string().decode('utf-8')
    # BUG FIX: pd.compat.StringIO was removed in pandas 1.0 — use io.StringIO.
    dfs.append(pd.read_csv(io.StringIO(data)))

# Fail with a clear message rather than pd.concat's generic empty-list error.
if not dfs:
    raise ValueError('No CSV files found under prefix "your-folder/"')

# Combine all DataFrames into one, renumbering rows 0..n-1.
combined_df = pd.concat(dfs, ignore_index=True)
print(combined_df.head())
AWS S3¶
In [ ]:
!pip install boto3
In [ ]:
import pandas as pd
import boto3
from io import StringIO

# Replace with your actual credentials and bucket/file details
aws_access_key_id = 'YOUR_ACCESS_KEY'
aws_secret_access_key = 'YOUR_SECRET_KEY'
bucket_name = 'your-bucket-name'
file_key = 'your-file.csv'

# Low-level S3 client authenticated with the static key pair above.
s3 = boto3.client(
    's3',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

# Fetch the object and decode its payload into a text buffer pandas can parse.
response = s3.get_object(Bucket=bucket_name, Key=file_key)
body_text = response['Body'].read().decode('utf-8')

df = pd.read_csv(StringIO(body_text))
print(df.head())
In [ ]:
import boto3
import pandas as pd
from io import StringIO

# Replace with your actual credentials and bucket details
aws_access_key_id = 'YOUR_ACCESS_KEY'
aws_secret_access_key = 'YOUR_SECRET_KEY'
bucket_name = 'your-bucket-name'
folder_prefix = 'your-folder/'

s3 = boto3.client(
    's3',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

# Enumerate the folder and keep only the CSV object keys.
listing = s3.list_objects_v2(Bucket=bucket_name, Prefix=folder_prefix)
files = [
    entry['Key']
    for entry in listing.get('Contents', [])
    if entry['Key'].endswith('.csv')
]

# Download and parse each CSV object into its own DataFrame.
dfs = []
for file_key in files:
    payload = s3.get_object(Bucket=bucket_name, Key=file_key)
    text = payload['Body'].read().decode('utf-8')
    dfs.append(pd.read_csv(StringIO(text)))

# Combine all DataFrames into one, renumbering rows 0..n-1.
combined_df = pd.concat(dfs, ignore_index=True)
