PyPDF2
tags :
Python Apps #
https://github.com/py-pdf/pypdf uses pytest for testing.
#

Reading attachments embedded in a PDF/A-3 file #
import PyPDF2
def getAttachments(reader):
    """
    Retrieves the file attachments of the PDF as a dictionary of file names
    and the file data as a bytestring.

    Two places are searched: the document-level ``/EmbeddedFiles`` name
    list in the catalog, and ``/FileAttachment`` annotations on every page.
    When the same file name occurs more than once, the last one found wins.

    Only a flat ``/Names`` array under ``/EmbeddedFiles`` is read; a name
    tree that is split into ``/Kids`` nodes is not traversed.

    :param reader: a PDF reader exposing ``trailer`` and ``pages``
    :return: dictionary of filenames and bytestrings
    """
    attachments = {}

    # First, get those that are pdf attachments
    catalog = reader.trailer["/Root"]
    # "/Names" is optional in the catalog, so test for it before indexing.
    if "/Names" in catalog and "/EmbeddedFiles" in catalog["/Names"]:
        embedded = catalog["/Names"]["/EmbeddedFiles"]
        fileNames = embedded["/Names"] if "/Names" in embedded else []
        # The array alternates [name, filespec, name, filespec, ...]; walk it
        # in pairs instead of looking each name up with index(), which is
        # quadratic and returns the wrong entry for duplicate names.
        for name, spec in zip(fileNames[0::2], fileNames[1::2]):
            if not isinstance(name, str):
                continue
            fDict = spec.get_object()
            # A file specification without "/EF" has no embedded data.
            if "/EF" not in fDict:
                continue
            attachments[name] = fDict["/EF"]["/F"].get_data()

    # Next, go through all pages and all annotations to those pages
    # to find any attached files
    for page_object in reader.pages:
        if "/Annots" not in page_object:
            continue
        for annot in page_object["/Annots"]:
            annotobj = annot.get_object()
            if annotobj["/Subtype"] != "/FileAttachment":
                continue
            if "/FS" not in annotobj:
                continue
            fileobj = annotobj["/FS"]
            if "/EF" not in fileobj:
                continue
            attachments[fileobj["/F"]] = fileobj["/EF"]["/F"].get_data()

    return attachments
# Extract every attachment of the PDF at `filename` and write each one to a
# file named after the attachment.
# NOTE(review): `filename` is not defined in this snippet; it is assumed to be
# set before this point -- confirm.
with open(filename, 'rb') as handler:
    # PdfReader replaces PdfFileReader, which was removed in PyPDF2 3.0.
    reader = PyPDF2.PdfReader(handler)
    # Collect the data while the input file is still open, since the
    # attachment streams are read through this handle.
    dictionary = getAttachments(reader)

for fName, fData in dictionary.items():
    # NOTE(review): fName comes from inside the PDF and is used as a path
    # as-is; for untrusted documents it should be sanitised (e.g. reduced to
    # its base name) before writing.
    with open(fName, 'wb') as outfile:
        outfile.write(fData)