From 10ca114a2dcecc9fea6b3a9bd72c8ea3ab9d5331 Mon Sep 17 00:00:00 2001 From: Arif Rasim Date: Fri, 19 Jan 2024 18:49:17 +0200 Subject: [PATCH] make strip accept substrings in a list --- camelot/cli.py | 2 +- camelot/io.py | 4 ++-- camelot/parsers/lattice.py | 6 +++--- camelot/parsers/stream.py | 6 +++--- camelot/utils.py | 32 ++++++++++++++++---------------- noxfile.py | 4 +++- tests/test_stream.py | 6 +++++- tests/test_utils.py | 11 +++++------ 8 files changed, 38 insertions(+), 33 deletions(-) diff --git a/camelot/cli.py b/camelot/cli.py index e45664c1..5dad3dce 100644 --- a/camelot/cli.py +++ b/camelot/cli.py @@ -63,7 +63,7 @@ def set_config(self, key, value): @click.option( "-strip", "--strip_text", - help="Characters that should be stripped from a string before" + help="Substrings that should be stripped from a string before" " assigning it to a cell.", ) @click.option( diff --git a/camelot/io.py b/camelot/io.py index 78319bc9..ddc1d815 100644 --- a/camelot/io.py +++ b/camelot/io.py @@ -52,8 +52,8 @@ def read_pdf( flag_size : bool, optional (default: False) Flag text based on font size. Useful to detect super/subscripts. Adds <s></s> around flagged text. - strip_text : str, optional (default: '') - Characters that should be stripped from a string before + strip_text : List, optional (default: []) + Substrings that should be stripped from a string before assigning it to a cell. row_tol^ : int, optional (default: 2) Tolerance parameter used to combine text vertically, diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index 01d17d96..7ca4aae0 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -60,8 +60,8 @@ class Lattice(BaseParser): flag_size : bool, optional (default: False) Flag text based on font size. Useful to detect super/subscripts. Adds <s></s> around flagged text.
- strip_text : str, optional (default: '') - Characters that should be stripped from a string before + strip_text : List, optional (default: []) + Substrings that should be stripped from a string before assigning it to a cell. line_tol : int, optional (default: 2) Tolerance parameter used to merge close vertical and horizontal @@ -98,7 +98,7 @@ def __init__( shift_text=["l", "t"], split_text=False, flag_size=False, - strip_text="", + strip_text=[], line_tol=2, joint_tol=2, threshold_blocksize=15, diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 266a0e95..adab6105 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -42,8 +42,8 @@ class Stream(BaseParser): flag_size : bool, optional (default: False) Flag text based on font size. Useful to detect super/subscripts. Adds <s></s> around flagged text. - strip_text : str, optional (default: '') - Characters that should be stripped from a string before + strip_text : List, optional (default: []) + Substrings that should be stripped from a string before assigning it to a cell. edge_tol : int, optional (default: 50) Tolerance parameter for extending textedges vertically. @@ -63,7 +63,7 @@ def __init__( columns=None, split_text=False, flag_size=False, - strip_text="", + strip_text=[], edge_tol=50, row_tol=2, column_tol=0, diff --git a/camelot/utils.py b/camelot/utils.py index 90ed400e..de137e32 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -484,14 +484,14 @@ def merge_close_lines(ar, line_tol=2): return ret -def text_strip(text, strip=""): - """Strips any characters in `strip` that are present in `text`. +def text_strip(text, strip=[]): + """Strips any substrings in `strip` that are present in `text`. Parameters ---------- text : str Text to process and strip. - strip : str, optional (default: '') - Characters that should be stripped from `text`. + strip : List, optional (default: []) + Substrings that should be stripped from `text`.
Returns ------- stripped : str @@ -499,9 +499,9 @@ def text_strip(text, strip=""): if not strip: return text - stripped = re.sub( - rf"[{''.join(map(re.escape, strip))}]", "", text, flags=re.UNICODE - ) + pattern = "|".join(map(re.escape, strip)) + + stripped = re.sub(pattern, "", text, flags=re.UNICODE) return stripped @@ -510,7 +510,7 @@ def text_strip(text, strip=""): # (inspired from sklearn.pipeline.Pipeline) -def flag_font_size(textline, direction, strip_text=""): +def flag_font_size(textline, direction, strip_text=[]): """Flags super/subscripts in text by enclosing them with <s></s>. May give false positives. @@ -520,8 +520,8 @@ def flag_font_size(textline, direction, strip_text=""): List of PDFMiner LTChar objects. direction : string Direction of the PDFMiner LTTextLine object. - strip_text : str, optional (default: '') - Characters that should be stripped from a string before + strip_text : List, optional (default: []) + Substrings that should be stripped from a string before assigning it to a cell. Returns @@ -562,7 +562,7 @@ def flag_font_size(textline, direction, strip_text=""): return text_strip(fstring, strip_text) -def split_textline(table, textline, direction, flag_size=False, strip_text=""): +def split_textline(table, textline, direction, flag_size=False, strip_text=[]): """Splits PDFMiner LTTextLine into substrings if it spans across multiple rows/columns. @@ -577,8 +577,8 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""): Whether or not to highlight a substring using <s></s> if its size is different from rest of the string. (Useful for super and subscripts.) - strip_text : str, optional (default: '') - Characters that should be stripped from a string before + strip_text : List, optional (default: []) + Substrings that should be stripped from a string before assigning it to a cell.
Returns @@ -681,7 +681,7 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""): def get_table_index( - table, t, direction, split_text=False, flag_size=False, strip_text="" + table, t, direction, split_text=False, flag_size=False, strip_text=[] ): """Gets indices of the table cell where given text object lies by comparing their y and x-coordinates. @@ -700,8 +700,8 @@ def get_table_index( Whether or not to highlight a substring using <s></s> if its size is different from rest of the string. (Useful for super and subscripts) - strip_text : str, optional (default: '') - Characters that should be stripped from a string before + strip_text : List, optional (default: []) + Substrings that should be stripped from a string before assigning it to a cell. Returns diff --git a/noxfile.py b/noxfile.py index 0f02b6e1..750b16b2 100644 --- a/noxfile.py +++ b/noxfile.py @@ -171,7 +171,9 @@ def tests(session: Session) -> None: "coverage[toml]", "pytest", "pygments", *base_requires, *plot_requires ) try: - session.run("coverage", "run", "--parallel", "-m", "pytest", *session.posargs) + session.run( + "coverage", "run", "--parallel", "-m", "pytest", "--pdb", *session.posargs + ) finally: if session.interactive: session.notify("coverage", posargs=[]) diff --git a/tests/test_stream.py b/tests/test_stream.py index e86f23b7..5a2698d6 100644 --- a/tests/test_stream.py +++ b/tests/test_stream.py @@ -95,7 +95,11 @@ def test_stream_strip_text(testdir): df = pd.DataFrame(data_stream_strip_text) filename = os.path.join(testdir, "detect_vertical_false.pdf") - tables = camelot.read_pdf(filename, flavor="stream", strip_text=" ,\n") + tables = camelot.read_pdf(filename, flavor="stream", strip_text=[" ", ",", "\n"]) + import pdb + + pdb.set_trace() + assert_frame_equal(df, tables[0].df) diff --git a/tests/test_utils.py b/tests/test_utils.py index 9a68f386..dda1b866 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -2,11 +2,10 @@ import os from pdfminer.converter
import PDFPageAggregator -from pdfminer.layout import ( - LAParams, - LTTextBoxHorizontal -) -from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager +from pdfminer.layout import LAParams +from pdfminer.layout import LTTextBoxHorizontal +from pdfminer.pdfinterp import PDFPageInterpreter +from pdfminer.pdfinterp import PDFResourceManager from pdfminer.pdfpage import PDFPage from camelot.utils import bbox_intersection_area @@ -16,7 +15,7 @@ def get_text_from_pdf(filename): "Method to extract text object from pdf" # https://stackoverflow.com/questions/22898145/how-to-extract-text-and-text-coordinates-from-a-pdf-file # https://pdfminer-docs.readthedocs.io/programming.html#performing-layout-analysis - document = open(filename, 'rb') + document = open(filename, "rb") # Create resource manager rsrcmgr = PDFResourceManager() # Set parameters for analysis.