diff --git a/camelot/cli.py b/camelot/cli.py
index e45664c1..5dad3dce 100644
--- a/camelot/cli.py
+++ b/camelot/cli.py
@@ -63,7 +63,7 @@ def set_config(self, key, value):
@click.option(
"-strip",
"--strip_text",
- help="Characters that should be stripped from a string before"
+ help="Substrings that should be stripped from a string before"
" assigning it to a cell.",
)
@click.option(
diff --git a/camelot/io.py b/camelot/io.py
index 78319bc9..ddc1d815 100644
--- a/camelot/io.py
+++ b/camelot/io.py
@@ -52,8 +52,8 @@ def read_pdf(
flag_size : bool, optional (default: False)
Flag text based on font size. Useful to detect
super/subscripts. Adds <s></s> around flagged text.
- strip_text : str, optional (default: '')
- Characters that should be stripped from a string before
+ strip_text : List, optional (default: [])
+ Substrings that should be stripped from a string before
assigning it to a cell.
row_tol^ : int, optional (default: 2)
Tolerance parameter used to combine text vertically,
diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py
index 01d17d96..7ca4aae0 100644
--- a/camelot/parsers/lattice.py
+++ b/camelot/parsers/lattice.py
@@ -60,8 +60,8 @@ class Lattice(BaseParser):
flag_size : bool, optional (default: False)
Flag text based on font size. Useful to detect
super/subscripts. Adds <s></s> around flagged text.
- strip_text : str, optional (default: '')
- Characters that should be stripped from a string before
+ strip_text : List, optional (default: [])
+ Substrings that should be stripped from a string before
assigning it to a cell.
line_tol : int, optional (default: 2)
Tolerance parameter used to merge close vertical and horizontal
@@ -98,7 +98,7 @@ def __init__(
shift_text=["l", "t"],
split_text=False,
flag_size=False,
- strip_text="",
+ strip_text=[],
line_tol=2,
joint_tol=2,
threshold_blocksize=15,
diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py
index 266a0e95..adab6105 100644
--- a/camelot/parsers/stream.py
+++ b/camelot/parsers/stream.py
@@ -42,8 +42,8 @@ class Stream(BaseParser):
flag_size : bool, optional (default: False)
Flag text based on font size. Useful to detect
super/subscripts. Adds <s></s> around flagged text.
- strip_text : str, optional (default: '')
- Characters that should be stripped from a string before
+ strip_text : List, optional (default: [])
+ Substrings that should be stripped from a string before
assigning it to a cell.
edge_tol : int, optional (default: 50)
Tolerance parameter for extending textedges vertically.
@@ -63,7 +63,7 @@ def __init__(
columns=None,
split_text=False,
flag_size=False,
- strip_text="",
+ strip_text=[],
edge_tol=50,
row_tol=2,
column_tol=0,
diff --git a/camelot/utils.py b/camelot/utils.py
index 90ed400e..de137e32 100644
--- a/camelot/utils.py
+++ b/camelot/utils.py
@@ -484,14 +484,14 @@ def merge_close_lines(ar, line_tol=2):
return ret
-def text_strip(text, strip=""):
- """Strips any characters in `strip` that are present in `text`.
+def text_strip(text, strip=[]):
+ """Strips any substrings in `strip` that are present in `text`.
Parameters
----------
text : str
Text to process and strip.
- strip : str, optional (default: '')
- Characters that should be stripped from `text`.
+ strip : List, optional (default: [])
+ Substrings that should be stripped from `text`.
Returns
-------
stripped : str
@@ -499,9 +499,9 @@ def text_strip(text, strip=""):
if not strip:
return text
- stripped = re.sub(
- rf"[{''.join(map(re.escape, strip))}]", "", text, flags=re.UNICODE
- )
+ pattern = "|".join(map(re.escape, strip))
+
+ stripped = re.sub(pattern, "", text, flags=re.UNICODE)
return stripped
@@ -510,7 +510,7 @@ def text_strip(text, strip=""):
# (inspired from sklearn.pipeline.Pipeline)
-def flag_font_size(textline, direction, strip_text=""):
+def flag_font_size(textline, direction, strip_text=[]):
"""Flags super/subscripts in text by enclosing them with <s></s>.
May give false positives.
@@ -520,8 +520,8 @@ def flag_font_size(textline, direction, strip_text=""):
List of PDFMiner LTChar objects.
direction : string
Direction of the PDFMiner LTTextLine object.
- strip_text : str, optional (default: '')
- Characters that should be stripped from a string before
+ strip_text : List, optional (default: [])
+ Substrings that should be stripped from a string before
assigning it to a cell.
Returns
@@ -562,7 +562,7 @@ def flag_font_size(textline, direction, strip_text=""):
return text_strip(fstring, strip_text)
-def split_textline(table, textline, direction, flag_size=False, strip_text=""):
+def split_textline(table, textline, direction, flag_size=False, strip_text=[]):
"""Splits PDFMiner LTTextLine into substrings if it spans across
multiple rows/columns.
@@ -577,8 +577,8 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
Whether or not to highlight a substring using <s></s>
if its size is different from rest of the string. (Useful for
super and subscripts.)
- strip_text : str, optional (default: '')
- Characters that should be stripped from a string before
+ strip_text : List, optional (default: [])
+ Substrings that should be stripped from a string before
assigning it to a cell.
Returns
@@ -681,7 +681,7 @@ def split_textline(table, textline, direction, flag_size=False, strip_text=""):
def get_table_index(
- table, t, direction, split_text=False, flag_size=False, strip_text=""
+ table, t, direction, split_text=False, flag_size=False, strip_text=[]
):
"""Gets indices of the table cell where given text object lies by
comparing their y and x-coordinates.
@@ -700,8 +700,8 @@ def get_table_index(
Whether or not to highlight a substring using <s></s>
if its size is different from rest of the string. (Useful for
super and subscripts)
- strip_text : str, optional (default: '')
- Characters that should be stripped from a string before
+ strip_text : List, optional (default: [])
+ Substrings that should be stripped from a string before
assigning it to a cell.
Returns
diff --git a/noxfile.py b/noxfile.py
index 0f02b6e1..750b16b2 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -171,7 +171,9 @@ def tests(session: Session) -> None:
"coverage[toml]", "pytest", "pygments", *base_requires, *plot_requires
)
try:
- session.run("coverage", "run", "--parallel", "-m", "pytest", *session.posargs)
+ session.run(
+ "coverage", "run", "--parallel", "-m", "pytest", "--pdb", *session.posargs
+ )
finally:
if session.interactive:
session.notify("coverage", posargs=[])
diff --git a/tests/test_stream.py b/tests/test_stream.py
index e86f23b7..5a2698d6 100644
--- a/tests/test_stream.py
+++ b/tests/test_stream.py
@@ -95,7 +95,11 @@ def test_stream_strip_text(testdir):
df = pd.DataFrame(data_stream_strip_text)
filename = os.path.join(testdir, "detect_vertical_false.pdf")
- tables = camelot.read_pdf(filename, flavor="stream", strip_text=" ,\n")
+ tables = camelot.read_pdf(filename, flavor="stream", strip_text=[" ", ",", "\n"])
+ import pdb
+
+ pdb.set_trace()
+
assert_frame_equal(df, tables[0].df)
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 9a68f386..dda1b866 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -2,11 +2,10 @@
import os
from pdfminer.converter import PDFPageAggregator
-from pdfminer.layout import (
- LAParams,
- LTTextBoxHorizontal
-)
-from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
+from pdfminer.layout import LAParams
+from pdfminer.layout import LTTextBoxHorizontal
+from pdfminer.pdfinterp import PDFPageInterpreter
+from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
from camelot.utils import bbox_intersection_area
@@ -16,7 +15,7 @@ def get_text_from_pdf(filename):
"Method to extract text object from pdf"
# https://stackoverflow.com/questions/22898145/how-to-extract-text-and-text-coordinates-from-a-pdf-file
# https://pdfminer-docs.readthedocs.io/programming.html#performing-layout-analysis
- document = open(filename, 'rb')
+ document = open(filename, "rb")
# Create resource manager
rsrcmgr = PDFResourceManager()
# Set parameters for analysis.