DOC/DOCX
In [ ]:
!pip install python-docx
!pip install textract
In [ ]:
# Reading a .docx File into a DataFrame
import pandas as pd
from docx import Document
def read_docx_to_dataframe(file_path):
    """Load the paragraphs of a .docx file into a one-column DataFrame.

    Args:
        file_path: Path of the .docx document to open.

    Returns:
        A DataFrame with a single 'Text' column containing the text of each
        paragraph, one row per paragraph, in the order the document yields
        them.
    """
    paragraph_texts = []
    for paragraph in Document(file_path).paragraphs:
        paragraph_texts.append(paragraph.text)
    return pd.DataFrame({'Text': paragraph_texts})
# Example usage: load 'sample.docx' (resolved relative to the working
# directory) into a DataFrame and print it.
df = read_docx_to_dataframe('sample.docx')
print(df)
In [ ]:
# Writing a DataFrame to a .docx File
import pandas as pd
from docx import Document
# Sample DataFrame: three rows with two string columns ('Name', 'City')
# and one integer column ('Age').
data = {
'Name': ['Alice', 'Bob', 'Charlie'],
'Age': [25, 30, 35],
'City': ['New York', 'Los Angeles', 'Chicago']
}
df = pd.DataFrame(data)
def write_dataframe_to_docx(df, file_path):
    """Write a DataFrame to a .docx file as a single table.

    The table starts with one header row holding the column labels and is
    followed by one table row per DataFrame row. The DataFrame index is not
    written.

    Args:
        df: DataFrame to export.
        file_path: Destination path of the .docx file.
    """
    doc = Document()
    table = doc.add_table(rows=1, cols=len(df.columns))

    # Cell text has to be a string, but column labels may be integers or
    # other objects (e.g. a default RangeIndex), so convert them explicitly.
    for header_cell, column in zip(table.rows[0].cells, df.columns):
        header_cell.text = str(column)

    # itertuples keeps every column's own dtype. iterrows builds one Series
    # per row and upcasts it, which would render the integer 25 as '25.0'
    # as soon as the frame also contains a float column.
    for values in df.itertuples(index=False, name=None):
        row_cells = table.add_row().cells
        for cell, value in zip(row_cells, values):
            cell.text = str(value)

    doc.save(file_path)
# Example usage: export the sample DataFrame to 'output.docx' in the
# working directory.
write_dataframe_to_docx(df, 'output.docx')
In [ ]:
# Reading a .doc File into a DataFrame
import pandas as pd
import textract
def read_doc_to_dataframe(file_path):
    """Extract the text of a .doc file into a one-column DataFrame.

    Args:
        file_path: Path of the .doc document handed to textract.

    Returns:
        A DataFrame with a single 'Text' column: the extracted text decoded
        as UTF-8 and split on '\\n', one row per resulting piece.
    """
    raw_bytes = textract.process(file_path)
    lines = raw_bytes.decode('utf-8').split('\n')
    return pd.DataFrame({'Text': lines})
# Example usage: load 'sample.doc' (resolved relative to the working
# directory) into a DataFrame and print it.
df = read_doc_to_dataframe('sample.doc')
print(df)
In [ ]:
# Writing a DataFrame to a .doc File
import pandas as pd
from docx import Document
import subprocess
def write_dataframe_to_docx(df, file_path):
    """Write a DataFrame to a .docx file as a single table.

    The table starts with one header row holding the column labels and is
    followed by one table row per DataFrame row. The DataFrame index is not
    written.

    Args:
        df: DataFrame to export.
        file_path: Destination path of the .docx file.
    """
    doc = Document()
    table = doc.add_table(rows=1, cols=len(df.columns))

    # Cell text has to be a string, but column labels may be integers or
    # other objects (e.g. a default RangeIndex), so convert them explicitly.
    for header_cell, column in zip(table.rows[0].cells, df.columns):
        header_cell.text = str(column)

    # itertuples keeps every column's own dtype. iterrows builds one Series
    # per row and upcasts it, which would render the integer 25 as '25.0'
    # as soon as the frame also contains a float column.
    for values in df.itertuples(index=False, name=None):
        row_cells = table.add_row().cells
        for cell, value in zip(row_cells, values):
            cell.text = str(value)

    doc.save(file_path)
def convert_docx_to_doc(docx_path, doc_path):
    """Convert a .docx file to the legacy .doc format using LibreOffice.

    Runs the `libreoffice` command-line tool in headless mode, lets it write
    the converted file into a temporary directory, and then moves that file
    to `doc_path`.

    Args:
        docx_path: Path of the existing .docx file to convert.
        doc_path: Path where the resulting .doc file should be placed.

    Raises:
        subprocess.CalledProcessError: If LibreOffice exits with a non-zero
            status.
        FileNotFoundError: If the `libreoffice` executable cannot be found,
            or if the conversion did not produce an output file.
    """
    # Imported locally so the function does not depend on imports made
    # elsewhere in the notebook.
    import os
    import shutil
    import tempfile

    # Converting into a private temporary directory avoids overwriting an
    # unrelated '<name>.doc' in the working directory and works no matter
    # which directory docx_path lives in.
    with tempfile.TemporaryDirectory() as out_dir:
        subprocess.run(
            ['libreoffice', '--headless', '--convert-to', 'doc',
             docx_path, '--outdir', out_dir],
            check=True,
        )

        # LibreOffice names the output after the input's base name; swap only
        # the real extension instead of every '.docx' substring in the path.
        stem = os.path.splitext(os.path.basename(docx_path))[0]
        converted_path = os.path.join(out_dir, stem + '.doc')

        # The exit status alone is not a reliable success signal, so verify
        # that the converted file actually exists before moving it.
        if not os.path.exists(converted_path):
            raise FileNotFoundError(
                'LibreOffice did not produce a .doc file for ' + repr(docx_path)
            )

        # shutil.move is portable, unlike shelling out to `mv`.
        shutil.move(converted_path, doc_path)
# Example usage: build the sample DataFrame, save it as 'output.docx', then
# convert that file to 'output.doc'. The conversion step calls the external
# `libreoffice` command.
data = {
'Name': ['Alice', 'Bob', 'Charlie'],
'Age': [25, 30, 35],
'City': ['New York', 'Los Angeles', 'Chicago']
}
df = pd.DataFrame(data)
write_dataframe_to_docx(df, 'output.docx')
convert_docx_to_doc('output.docx', 'output.doc')
In [ ]:
# Importing multiple .docx documents from a folder into a DataFrame
import os
import pandas as pd
from docx import Document
def read_docx(file_path):
    """Return the full text of a .docx file as one string.

    Args:
        file_path: Path of the .docx document to open.

    Returns:
        The text of every paragraph, joined with '\\n' separators.
    """
    paragraphs = Document(file_path).paragraphs
    return '\n'.join(paragraph.text for paragraph in paragraphs)
# Read every .docx file in a folder into a DataFrame with one row per file.
folder_path = 'path/to/your/folder'
documents = []
# os.listdir returns entries in arbitrary order; sorting makes the row order
# reproducible between runs.
for filename in sorted(os.listdir(folder_path)):
    # Accept any capitalisation of the extension.
    if not filename.lower().endswith('.docx'):
        continue
    # Word creates '~$<name>.docx' lock files while a document is open; they
    # are not real documents and cannot be parsed, so skip them.
    if filename.startswith('~$'):
        continue
    file_path = os.path.join(folder_path, filename)
    doc_text = read_docx(file_path)
    documents.append({'filename': filename, 'text': doc_text})
# Naming the columns keeps the schema stable even when no .docx file is found.
df = pd.DataFrame(documents, columns=['filename', 'text'])
print(df)
