Creating DataFrames
In [ ]:
import pandas as pd
In [ ]:
# Build a DataFrame by hand from a column-oriented dict:
# each key becomes a column, each list its values.
people = {
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [25, 30, 35],
}
df = pd.DataFrame(people)
print(df)
In [ ]:
# A small synthetic patient-record table, defined column by column.
records = {
    'PatientID': [1, 2, 3],
    'Name': ['John Doe', 'Jane Smith', 'Emily Davis'],
    'Age': [45, 38, 50],
    'Diagnosis': ['Hypertension', 'Diabetes', 'Asthma'],
}
df_manual = pd.DataFrame(records)
print(df_manual)
CSV¶
In [ ]:
# Parse a plain CSV file into a DataFrame and preview the first rows.
csv_path = 'healthcare_data.csv'
df_csv = pd.read_csv(csv_path)
print(df_csv.head())
Parquet¶
In [ ]:
# Parquet is a binary, columnar format; pandas reads it natively.
parquet_path = 'healthcare_data.parquet'
df_parquet = pd.read_parquet(parquet_path)
print(df_parquet.head())
XLS¶
In [ ]:
# Legacy Excel workbook (.xls); read_excel picks the engine from the extension.
xls_path = 'healthcare_data.xls'
df_xls = pd.read_excel(xls_path)
print(df_xls.head())
XLSX¶
In [ ]:
# Modern Excel workbook (.xlsx); same read_excel entry point as .xls.
xlsx_path = 'healthcare_data.xlsx'
df_xlsx = pd.read_excel(xlsx_path)
print(df_xlsx.head())
GZ¶
In [ ]:
import pandas as pd
import gzip

# Decompress the gzip archive on the fly ('rt' = read text) and
# feed the resulting text stream straight to the CSV parser.
with gzip.open('healthcare_data.csv.gz', mode='rt') as stream:
    df_gz = pd.read_csv(stream)
print(df_gz.head())
ZIP¶
In [ ]:
# Note: the `zipfile` module is part of the Python standard library — no installation (e.g. the `zipfile36` backport) is needed.
In [ ]:
import pandas as pd
import zipfile

# Open the archive, then read one named CSV member directly from it
# without extracting anything to disk.
with zipfile.ZipFile('healthcare_data.zip') as archive:
    with archive.open('healthcare_data.csv') as member:
        df_zip = pd.read_csv(member)
print(df_zip.head())
In [ ]:
import pandas as pd
import zipfile

# Read every CSV member of a ZIP archive and stack them into one DataFrame.

# Path to your ZIP file
zip_file_path = 'path/to/your/file.zip'

df_list = []
with zipfile.ZipFile(zip_file_path, 'r') as z:
    for file_name in z.namelist():
        # Skip non-CSV members (directories, readmes, metadata files, ...).
        if not file_name.endswith('.csv'):
            continue
        with z.open(file_name) as f:
            df_list.append(pd.read_csv(f))

# pd.concat raises a generic "No objects to concatenate" on an empty list;
# fail with a message that names the archive instead.
if not df_list:
    raise ValueError(f'No CSV files found in {zip_file_path!r}')

# ignore_index=True renumbers rows 0..n-1 across the concatenated frames.
combined_df = pd.concat(df_list, ignore_index=True)
print(combined_df.head())
TAR¶
In [ ]:
import pandas as pd
import tarfile

# Stream each regular file out of the gzipped tarball ('r:gz'),
# parse it as CSV, and preview it — no extraction to disk.
with tarfile.open('healthcare_data.tar.gz', 'r:gz') as tar:
    for member in tar.getmembers():
        if not member.isfile():
            continue
        handle = tar.extractfile(member)
        df_tar = pd.read_csv(handle)
        print(df_tar.head())
In [ ]:
import pandas as pd
import tarfile

# Read every CSV member of a TAR archive and stack them into one DataFrame.

# Path to your TAR file
tar_file_path = 'path/to/your/file.tar'

df_list = []
with tarfile.open(tar_file_path, 'r') as t:
    for file_name in t.getnames():
        # Only CSV members are of interest.
        if not file_name.endswith('.csv'):
            continue
        f = t.extractfile(file_name)
        # extractfile() returns None for non-regular members (dirs, links);
        # the original code would have crashed pd.read_csv on those.
        if f is None:
            continue
        with f:
            df_list.append(pd.read_csv(f))

# Fail with a clear message rather than pd.concat's generic empty-list error.
if not df_list:
    raise ValueError(f'No CSV files found in {tar_file_path!r}')

combined_df = pd.concat(df_list, ignore_index=True)
print(combined_df.head())
Reading multiple CSV files in a directory¶
In [ ]:
import glob
import os

import pandas as pd

# Combine every CSV file in a directory into a single DataFrame.

# Directory containing the CSV files.
path = 'path/to/your/csv/files/'

# os.path.join is safer than string concatenation (handles missing
# trailing separators); sorted() makes the row order reproducible
# across platforms, since glob's order is filesystem-dependent.
all_files = sorted(glob.glob(os.path.join(path, '*.csv')))

# Read each file into its own DataFrame.
dfs = [pd.read_csv(file) for file in all_files]

# pd.concat raises a generic error on an empty list; name the directory instead.
if not dfs:
    raise ValueError(f'No CSV files found under {path!r}')

# ignore_index=True renumbers rows 0..n-1 across all input files.
combined_df = pd.concat(dfs, ignore_index=True)
print(combined_df)
Google Cloud Storage¶
In [ ]:
!pip install google-cloud-storage
In [ ]:
import io

import pandas as pd
from google.cloud import storage

# Download one CSV object from Google Cloud Storage into a DataFrame.

# Replace with your actual credentials and bucket/file details
credentials_json = {
    "type": "service_account",
    "project_id": "your-project-id",
    "private_key_id": "your-private-key-id",
    "private_key": "your-private-key",
    "client_email": "your-client-email",
    "client_id": "your-client-id",
    "auth_uri": "https://accounts.google.com/o/oauth2/auth",
    "token_uri": "https://oauth2.googleapis.com/token",
    "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
    "client_x509_cert_url": "your-client-x509-cert-url"
}

# Authenticate straight from the in-memory service-account key.
client = storage.Client.from_service_account_info(credentials_json)
bucket = client.get_bucket('your-bucket-name')
blob = bucket.blob('your-file.csv')

# Download the object's bytes and decode them to text.
data = blob.download_as_string().decode('utf-8')

# BUG FIX: pd.compat.StringIO was removed in pandas 1.0 — wrap the text
# in io.StringIO so read_csv gets a file-like object.
df = pd.read_csv(io.StringIO(data))
print(df.head())
In [ ]:
import io

from google.cloud import storage
import pandas as pd

# Download every CSV under a GCS prefix and stack them into one DataFrame.

# Replace with your actual credentials and bucket details
credentials_json = {
    "type": "service_account",
    "project_id": "your-project-id",
    "private_key_id": "your-private-key-id",
    "private_key": "your-private-key",
    "client_email": "your-client-email",
    "client_id": "your-client-id",
    "auth_uri": "https://accounts.google.com/o/oauth2/auth",
    "token_uri": "https://oauth2.googleapis.com/token",
    "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
    "client_x509_cert_url": "your-client-x509-cert-url"
}

client = storage.Client.from_service_account_info(credentials_json)
bucket = client.get_bucket('your-bucket-name')

# List all objects under the folder prefix.
blobs = bucket.list_blobs(prefix='your-folder/')

# Read each CSV object into its own DataFrame.
dfs = []
for blob in blobs:
    if not blob.name.endswith('.csv'):
        continue
    data = blob.download_as_string().decode('utf-8')
    # BUG FIX: pd.compat.StringIO was removed in pandas 1.0 — use io.StringIO.
    dfs.append(pd.read_csv(io.StringIO(data)))

# Fail with a clear message rather than pd.concat's generic empty-list error.
if not dfs:
    raise ValueError('No CSV files found under prefix "your-folder/"')

# Combine all DataFrames into one, renumbering rows 0..n-1.
combined_df = pd.concat(dfs, ignore_index=True)
print(combined_df.head())
AWS S3¶
In [ ]:
!pip install boto3
In [ ]:
import pandas as pd
import boto3
from io import StringIO

# Replace with your actual credentials and bucket/file details
aws_access_key_id = 'YOUR_ACCESS_KEY'
aws_secret_access_key = 'YOUR_SECRET_KEY'
bucket_name = 'your-bucket-name'
file_key = 'your-file.csv'

# Low-level S3 client authenticated with the static key pair above.
s3 = boto3.client(
    's3',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

# Fetch the object and decode its payload into a text buffer pandas can parse.
response = s3.get_object(Bucket=bucket_name, Key=file_key)
body_text = response['Body'].read().decode('utf-8')

df = pd.read_csv(StringIO(body_text))
print(df.head())
In [ ]:
import boto3
import pandas as pd
from io import StringIO

# Replace with your actual credentials and bucket details
aws_access_key_id = 'YOUR_ACCESS_KEY'
aws_secret_access_key = 'YOUR_SECRET_KEY'
bucket_name = 'your-bucket-name'
folder_prefix = 'your-folder/'

s3 = boto3.client(
    's3',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

# Enumerate the folder and keep only the CSV object keys.
listing = s3.list_objects_v2(Bucket=bucket_name, Prefix=folder_prefix)
files = [
    entry['Key']
    for entry in listing.get('Contents', [])
    if entry['Key'].endswith('.csv')
]

# Download and parse each CSV object into its own DataFrame.
dfs = []
for file_key in files:
    payload = s3.get_object(Bucket=bucket_name, Key=file_key)
    text = payload['Body'].read().decode('utf-8')
    dfs.append(pd.read_csv(StringIO(text)))

# Combine all DataFrames into one, renumbering rows 0..n-1.
combined_df = pd.concat(dfs, ignore_index=True)
