PyPDF2

PyPDF2

July 8, 2024 | seedling, permanent

tags :

Python Apps #

https://github.com/py-pdf/pypdf uses pytest for testing.

<2023-09-03 Sun> #

Reading attachments embedded in a PDF/A-3 file #

stackoverflow


import os

import PyPDF2

def _iter_embedded_files(node):
    """
    Yield ``(name, file_spec)`` pairs from an ``/EmbeddedFiles`` name tree.

    A name-tree node stores its entries either directly in a flat ``/Names``
    array (``[name1, spec1, name2, spec2, ...]``) or in child nodes listed
    under ``/Kids``; both layouts are walked.

    :param node: dictionary-like name-tree node
    :return: generator of (name, resolved file specification) tuples
    """
    if "/Names" in node:
        entries = node["/Names"]
        # Pair every name with the entry right after it by position.
        # Looking the name up with ``list.index`` would always find the FIRST
        # occurrence and mis-pair duplicated names. The last entry is skipped
        # as a name candidate because nothing follows it.
        for position, entry in enumerate(entries[:-1]):
            if isinstance(entry, str):
                yield entry, entries[position + 1].get_object()
    if "/Kids" in node:
        for kid in node["/Kids"]:
            yield from _iter_embedded_files(kid.get_object())


def getAttachments(reader):
    """
    Retrieves the file attachments of the PDF as a dictionary of file names
    and the file data as a bytestring.

    Two sources are read: the document-level ``/EmbeddedFiles`` name tree in
    the catalog, and ``/FileAttachment`` annotations on every page. When the
    same name occurs more than once, the entry read last wins.

    :param reader: PDF reader exposing ``trailer`` and ``pages``
    :return: dictionary of filenames and bytestrings
    """
    attachments = {}

    # First, get those that are document-level attachments.
    catalog = reader.trailer["/Root"]
    # A PDF without any name dictionary has no "/Names" key in the catalog.
    if "/Names" in catalog and "/EmbeddedFiles" in catalog["/Names"]:
        tree_root = catalog["/Names"]["/EmbeddedFiles"]
        for name, file_spec in _iter_embedded_files(tree_root):
            attachments[name] = file_spec["/EF"]["/F"].get_data()

    # Next, go through all pages and all annotations on those pages
    # to find any attached files.
    for page_object in reader.pages:
        if "/Annots" not in page_object:
            continue
        for annot in page_object["/Annots"]:
            annotobj = annot.get_object()
            # Tolerate annotations that carry no "/Subtype" entry.
            if "/Subtype" in annotobj and annotobj["/Subtype"] == "/FileAttachment":
                fileobj = annotobj["/FS"]
                attachments[fileobj["/F"]] = fileobj["/EF"]["/F"].get_data()
    return attachments



# Extract every attachment of the PDF at `filename` into the current directory.
# NOTE: `filename` is not defined in this snippet; set it to the PDF path first.
with open(filename, 'rb') as handler:
    # PdfReader is the current name of the reader class; PdfFileReader is the
    # deprecated spelling and is rejected by PyPDF2 3.x.
    reader = PyPDF2.PdfReader(handler)
    # Read the attachments while the file is still open.
    dictionary = getAttachments(reader)
for fName, fData in dictionary.items():
    # Attachment names come from the PDF itself and are untrusted: keep only
    # the final path component so nothing is written outside this directory.
    safe_name = os.path.basename(fName)
    if not safe_name:
        continue
    with open(safe_name, 'wb') as outfile:
        outfile.write(fData)


No notes link to this note