Error in PyPDF2 3.0.0 #478

NamanS-14 · 2023-12-31T15:14:11Z

While following every step of the installation instructions as given, I hit an error that I am unable to resolve. It is raised by PyPDF2 3.0.0, in the file /usr/local/lib/python3.10/dist-packages/PyPDF2/_utils.py.
I am attaching screenshots of the error.
Please help me get the library running.

viettran295 · 2024-01-01T20:52:23Z

I also got this error. I solved it by installing from the GitHub repo; alternatively, you have to fix the PyPDF2 calls in handlers.py yourself.

NamanS-14 · 2024-01-02T05:28:01Z

Could you please explain in a little more detail how I can solve this issue?
Which file do I need to install from GitHub, as you mentioned above?

ayushtiwariji420 · 2024-02-16T07:38:35Z

I got the same error and fixed it by updating the library's code in the file handlers.py.
Now it's working.

# -*- coding: utf-8 -*-

import os
import sys
import PyPDF2
from PyPDF2 import PdfFileReader

from .core import TableList
from .parsers import Stream, Lattice
from .utils import (
TemporaryDirectory,
get_page_layout,
get_text_objects,
get_rotation,
is_url,
download_url,
)

class PDFHandler(object):
    """Handles all operations like temp directory creation, splitting
    file into single page PDFs, parsing each PDF and then removing the
    temp directory.

    Parameters
    ----------
    filepath : str
        Filepath or URL of the PDF file.
    pages : str, optional (default: '1')
        Comma-separated page numbers.
        Example: '1,3,4' or '1,4-end' or 'all'.
    password : str, optional (default: None)
        Password for decryption.

    """

    def __init__(self, filepath, pages="1", password=None):
        """Resolve the input file and the list of pages to process.

        Raises
        ------
        NotImplementedError
            If the (possibly downloaded) file path does not end in '.pdf'.

        """
        if is_url(filepath):
            filepath = download_url(filepath)
        self.filepath = filepath
        if not filepath.lower().endswith(".pdf"):
            raise NotImplementedError("File format not supported")

        # An empty string stands in for "no password" when decrypt() is
        # called. The former Python 2 ASCII-encoding branch was dropped:
        # this module uses f-strings, so it can only run on Python 3.
        self.password = "" if password is None else password
        self.pages = self._get_pages(self.filepath, pages)

    def _get_pages(self, filepath, pages):
        """Converts pages string to list of ints.

        Parameters
        ----------
        filepath : str
            Filepath or URL of the PDF file.
        pages : str, optional (default: '1')
            Comma-separated page numbers.
            Example: '1,3,4' or '1,4-end' or 'all'.

        Returns
        -------
        P : list
            Sorted list of unique int page numbers.

        """
        page_numbers = []
        if pages == "1":
            # Fast path: the default needs no knowledge of the document,
            # so the PDF is not opened at all.
            page_numbers.append({"start": 1, "end": 1})
        else:
            # The context manager closes the file even when the page spec
            # is malformed and int() raises below.
            with open(filepath, "rb") as instream:
                infile = PyPDF2.PdfReader(instream, strict=False)
                if infile.is_encrypted:
                    infile.decrypt(self.password)
                if pages == "all":
                    page_numbers.append({"start": 1, "end": len(infile.pages)})
                else:
                    for r in pages.split(","):
                        if "-" in r:
                            a, b = r.split("-")
                            if b == "end":
                                # getNumPages() was removed in PyPDF2 3.0.0;
                                # len(reader.pages) is the replacement.
                                b = len(infile.pages)
                            page_numbers.append({"start": int(a), "end": int(b)})
                        else:
                            page_numbers.append({"start": int(r), "end": int(r)})
        P = []
        for p in page_numbers:
            P.extend(range(p["start"], p["end"] + 1))
        return sorted(set(P))

    def _save_page(self, filepath, page, temp):
        """Saves specified page from PDF into a temporary directory.

        The page is written to ``<temp>/page-<page>.pdf``. If the text on
        the page is detected as rotated, the page is rewritten to the same
        path with the rotation undone.

        Parameters
        ----------
        filepath : str
            Filepath or URL of the PDF file.
        page : int
            Page number.
        temp : str
            Tmp directory.

        """
        with open(filepath, "rb") as fileobj:
            infile = PyPDF2.PdfReader(fileobj, strict=False)
            if infile.is_encrypted:
                infile.decrypt(self.password)
            fpath = os.path.join(temp, f"page-{page}.pdf")
            froot, fext = os.path.splitext(fpath)
            p = infile.pages[page - 1]
            outfile = PyPDF2.PdfWriter()
            outfile.add_page(p)
            with open(fpath, "wb") as f:
                outfile.write(f)
            layout, _ = get_page_layout(fpath)
            # fix rotated PDF
            chars = get_text_objects(layout, ltype="char")
            horizontal_text = get_text_objects(layout, ltype="horizontal_text")
            vertical_text = get_text_objects(layout, ltype="vertical_text")
            rotation = get_rotation(chars, horizontal_text, vertical_text)
            if rotation != "":
                # Move the single-page file aside, then write the corrected
                # page back to the original path.
                fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
                os.rename(fpath, fpath_new)
                with open(fpath_new, "rb") as instream:
                    infile = PyPDF2.PdfReader(instream, strict=False)
                    if infile.is_encrypted:
                        infile.decrypt(self.password)
                    outfile = PyPDF2.PdfWriter()
                    p = infile.pages[0]
                    # rotateClockwise()/rotateCounterClockwise() were removed
                    # in PyPDF2 3.0.0; rotate() takes a signed angle instead
                    # (positive = clockwise).
                    if rotation == "anticlockwise":
                        p.rotate(90)
                    elif rotation == "clockwise":
                        p.rotate(-90)
                    outfile.add_page(p)
                    with open(fpath, "wb") as f:
                        outfile.write(f)

    def parse(
        self, flavor="lattice", suppress_stdout=False, layout_kwargs=None, **kwargs
    ):
        """Extracts tables by calling parser.extract_tables on all single
        page PDFs.

        Parameters
        ----------
        flavor : str (default: 'lattice')
            The parsing method to use ('lattice' or 'stream').
            Lattice is used by default.
        suppress_stdout : bool (default: False)
            Suppress logs and warnings.
        layout_kwargs : dict, optional (default: None)
            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
            ``None`` is treated as an empty dict.
        kwargs : dict
            See camelot.read_pdf kwargs.

        Returns
        -------
        tables : camelot.core.TableList
            List of tables found in PDF.

        """
        # A fresh dict per call avoids sharing one mutable default object
        # across every invocation.
        if layout_kwargs is None:
            layout_kwargs = {}
        tables = []
        with TemporaryDirectory() as tempdir:
            for p in self.pages:
                self._save_page(self.filepath, p, tempdir)
            pages = [os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages]
            parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
            for p in pages:
                t = parser.extract_tables(
                    p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
                )
                tables.extend(t)
        return TableList(sorted(tables))

Kev744 · 2024-04-03T10:47:31Z

Or simply install these versions:
!pip install PyPDF2==2.12.1
!pip install 'camelot-py[base]'

Make sure both packages have been uninstalled beforehand.

sHermanGriffiths · 2024-07-10T19:39:06Z

They should really update the codebase to use pypdf instead, as it is the most recently released version.

bosd · 2024-08-06T12:55:39Z

Hey!

As discussed in #343, we are trying to build a maintained fork at pypdf_table_extraction.

Do you want to check that code and open an issue there if the problem still exists?

phuang07 mentioned this issue Jan 17, 2024

Master #483

Open

blinklet mentioned this issue Aug 23, 2024

Import festival syllabus blinklet/music-festival-organizer#44

Closed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Error in PyPDF2 3.0.0 #478

Error in PyPDF2 3.0.0 #478

NamanS-14 commented Dec 31, 2023

viettran295 commented Jan 1, 2024

NamanS-14 commented Jan 2, 2024

ayushtiwariji420 commented Feb 16, 2024

Kev744 commented Apr 3, 2024

sHermanGriffiths commented Jul 10, 2024

bosd commented Aug 6, 2024

Error in PyPDF2 3.0.0 #478

Error in PyPDF2 3.0.0 #478

Comments

NamanS-14 commented Dec 31, 2023

viettran295 commented Jan 1, 2024

NamanS-14 commented Jan 2, 2024

ayushtiwariji420 commented Feb 16, 2024

-- coding: utf-8 --

Kev744 commented Apr 3, 2024

sHermanGriffiths commented Jul 10, 2024

bosd commented Aug 6, 2024