In [ ]:
!pip install pandas pymupdf
!pip install reportlab
In [ ]:
# Importing multiple PDF documents into a DataFrame
import os
import pandas as pd
import fitz # PyMuPDF
def read_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        One string holding the output of ``page.get_text()`` for each page.
    """
    # Opening the document as a context manager guarantees it is closed
    # (and its file handle released) even if extraction fails part-way.
    with fitz.open(file_path) as doc:
        # join() avoids repeatedly re-copying a growing string per page.
        return "".join(page.get_text() for page in doc)
folder_path = 'path/to/your/folder'

# Collect one {'filename', 'text'} record per PDF found directly in folder_path.
documents = []
# sorted() makes the row order reproducible; os.listdir order is arbitrary.
for filename in sorted(os.listdir(folder_path)):
    # Compare case-insensitively so files such as "REPORT.PDF" are not skipped.
    if not filename.lower().endswith('.pdf'):
        continue
    file_path = os.path.join(folder_path, filename)
    # A directory whose name happens to end in ".pdf" cannot be read as a PDF.
    if not os.path.isfile(file_path):
        continue
    doc_text = read_pdf(file_path)
    documents.append({'filename': filename, 'text': doc_text})

# Naming the columns keeps 'filename' and 'text' present even when no PDF
# was found, so later cells can still reference them.
df = pd.DataFrame(documents, columns=['filename', 'text'])
print(df)
In [ ]:
# Writing DataFrame to PDF Files
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
def write_text_to_pdf(text, file_path):
    """Write ``text`` to a new letter-size PDF, one drawn line per text line.

    Args:
        text: The string to render. Line breaks in the string start a new
            drawn line.
        file_path: Destination path of the PDF file to create.
    """
    c = canvas.Canvas(file_path, pagesize=letter)
    _, height = letter
    margin = 72       # distance kept from the left, top and bottom edges
    line_height = 15  # vertical step between consecutive lines
    y = height - margin
    # drawString places exactly one line and does not interpret newlines, so
    # the text is split first. The [''] fallback keeps the original behaviour
    # of still drawing once when the text is empty.
    for line in text.splitlines() or ['']:
        if y < margin:
            # Out of vertical space: continue on a fresh page.
            c.showPage()
            y = height - margin
        c.drawString(margin, y, line)
        y -= line_height
    c.save()
# Write each DataFrame row to its own PDF file in the working directory.
for index, row in df.iterrows():
    base_name = str(row['filename'])
    # Drop a trailing ".pdf" so the output is not named "output_x.pdf_....pdf".
    if base_name.lower().endswith('.pdf'):
        base_name = base_name[:-len('.pdf')]
    # A DataFrame holding only 'filename' and 'text' columns has no
    # 'page_number'; only include it in the name when it exists instead of
    # failing with KeyError.
    if 'page_number' in row.index:
        output_file = f"output_{base_name}_page_{row['page_number']}.pdf"
    else:
        output_file = f"output_{base_name}.pdf"
    write_text_to_pdf(row['text'], output_file)
In [ ]:
# Writing multiple pages to a single PDF document
import pandas as pd
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
# Sample DataFrame
# Column-oriented sample data: two documents with two pages each.
data = dict(
    filename=['doc1', 'doc1', 'doc2', 'doc2'],
    page_number=[1, 2, 1, 2],
    text=[
        'This is the text of page 1 of doc1.',
        'This is the text of page 2 of doc1.',
        'This is the text of page 1 of doc2.',
        'This is the text of page 2 of doc2.',
    ],
)

# One DataFrame row per (filename, page_number) pair.
df = pd.DataFrame(data)
def write_dataframe_to_pdf(df, output_file):
    """Write every DataFrame row to one letter-size PDF, one row per page.

    Each row's page starts with a "Filename" and a "Page Number" header line
    followed by the row's text, one drawn line per text line. Text that does
    not fit is continued on additional pages rather than being drawn below
    the bottom of the page.

    Args:
        df: DataFrame with 'filename', 'page_number' and 'text' columns.
        output_file: Destination path of the PDF file to create.
    """
    c = canvas.Canvas(output_file, pagesize=letter)
    _, height = letter
    left = 72         # x position of every drawn line
    margin = 72       # space kept at the top and bottom of a page
    line_height = 15  # vertical step between consecutive text lines
    for index, row in df.iterrows():
        c.drawString(left, height - 72, f"Filename: {row['filename']}")
        c.drawString(left, height - 90, f"Page Number: {row['page_number']}")
        y = height - 120
        for line in row['text'].split('\n'):
            if y < margin:
                # The current page is full: continue this row on a new page.
                c.showPage()
                y = height - margin
            c.drawString(left, y, line)
            y -= line_height
        # Finish this row's (last) page before starting the next row.
        c.showPage()
    c.save()
# Render the sample DataFrame into output.pdf in the working directory.
write_dataframe_to_pdf(df=df, output_file='output.pdf')
In [ ]:
!pip install pdfkit
!pip install reportlab
In [ ]:
# Converting HTML to PDF
import pdfkit
# Content to convert, spelled out one line per literal.
html_content = (
    "\n"
    "Sample HTML\n"
    "Hello, World!\n"
    "This is a sample HTML to PDF conversion.\n"
)

# Hand the string to pdfkit and write the result to output.pdf.
pdfkit.from_string(html_content, output_path='output.pdf')
In [ ]:
# Converting an HTML File to PDF
import pdfkit
# HTML document on disk that pdfkit reads and converts into output.pdf.
html_file = 'sample.html'
pdfkit.from_file(html_file, output_path='output.pdf')
In [ ]:
# Converting a URL to PDF
import pdfkit
# Address of the web page that pdfkit converts into output.pdf.
url = 'https://www.example.com'
pdfkit.from_url(url, output_path='output.pdf')
In [ ]:
# Converting HTML to PDF with page-layout options
import pdfkit
# Content to convert; the leading and trailing empty items reproduce the
# newline at the start and end of the text.
html_content = "\n".join([
    "",
    "Sample HTML",
    "Hello, World!",
    "This is a sample HTML to PDF conversion.",
    "",
])

# Page layout settings passed to pdfkit: paper size and orientation first,
# then the same 10mm margin on all four sides.
options = {'page-size': 'A4', 'orientation': 'Portrait'}
options.update(dict.fromkeys(
    ('margin-top', 'margin-right', 'margin-bottom', 'margin-left'),
    '10mm',
))

pdfkit.from_string(html_content, output_path='output.pdf', options=options)
In [ ]:
# Adding Images to a PDF
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from reportlab.lib.utils import ImageReader
def create_pdf_with_images(output_file):
    """Create a one-page letter-size PDF with two text lines and two images.

    The two image files are read from fixed paths and drawn side by side as
    200x200 images.

    Args:
        output_file: Destination path of the PDF file to create.
    """
    pdf = canvas.Canvas(output_file, pagesize=letter)
    page_width, page_height = letter

    # Text lines, given as (distance below the top edge, caption).
    captions = (
        (72, "Hello, World!"),
        (90, "This is a sample PDF with images."),
    )
    for drop, caption in captions:
        pdf.drawString(72, page_height - drop, caption)

    # Images, given as (source path, x position); both share the same y.
    placements = (
        ('path/to/your/image.jpg', 72),
        ('path/to/another/image.png', 300),
    )
    for source, x in placements:
        picture = ImageReader(source)
        pdf.drawImage(picture, x, page_height - 300, width=200, height=200)

    pdf.save()
# Build the sample PDF in the working directory.
create_pdf_with_images(output_file='output_with_images.pdf')
In [ ]:
# Adding Images to Each Page from a DataFrame
import pandas as pd
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from reportlab.lib.utils import ImageReader
# One (page text, image path) pair per page of the PDF to generate.
_pages = [
    ('Page 1 text', 'path/to/image1.jpg'),
    ('Page 2 text', 'path/to/image2.jpg'),
    ('Page 3 text', 'path/to/image3.jpg'),
]

# Transpose the pairs into the column-oriented mapping the DataFrame is built from.
data = {
    'text': [page_text for page_text, _ in _pages],
    'image_path': [page_image for _, page_image in _pages],
}
df = pd.DataFrame(data)
def create_pdf_from_dataframe(df, output_file):
    """Create a letter-size PDF with one page per DataFrame row.

    Each page shows the row's text near the top and the row's image drawn
    below it as a 200x200 image.

    Args:
        df: DataFrame with 'text' and 'image_path' columns.
        output_file: Destination path of the PDF file to create.
    """
    pdf = canvas.Canvas(output_file, pagesize=letter)
    page_width, page_height = letter

    # Every page uses the same layout, so the positions are computed once.
    text_y = page_height - 72
    image_y = page_height - 300

    for _, record in df.iterrows():
        pdf.drawString(72, text_y, record['text'])
        picture = ImageReader(record['image_path'])
        pdf.drawImage(picture, 72, image_y, width=200, height=200)
        # Close the current page so the next row starts on a new one.
        pdf.showPage()

    pdf.save()
# Render the text/image DataFrame into a PDF in the working directory.
create_pdf_from_dataframe(df=df, output_file='output_from_dataframe.pdf')
In [ ]:
# Converting HTML content with image links to PDF
import pdfkit
# Content to convert, spelled out one line per literal.
# NOTE(review): the text contains no HTML tags or image references although
# the cell title mentions image links — confirm against the original notebook.
html_content = (
    "\n"
    "Sample HTML with Images\n"
    "Welcome to My PDF\n"
    "This is a sample HTML file with images.\n"
)

# Save the content as sample.html (replacing any existing file), then have
# pdfkit convert that file into output.pdf.
with open('sample.html', 'w') as handle:
    handle.write(html_content)

pdfkit.from_file('sample.html', output_path='output.pdf')
