From cc3eb0d21585010adf0ff5534e44ffbad7646202 Mon Sep 17 00:00:00 2001
From: Gautier Solard <104368859+gsolard@users.noreply.github.com>
Date: Thu, 12 Oct 2023 14:37:06 +0200
Subject: [PATCH] Fixes several bugs in words_n_fun tests (#26)

* Fixes several bugs in words_n_fun tests

* Forgot to save requirements.txt for spacy version change
---
 requirements.txt                                |  2 +-
 setup.py                                        |  2 +-
 tests/test_1_utils.py                           | 10 +++++-----
 tests/test_3_api.py                             |  4 ++--
 tests/test_4_vectorizationTokenization.py       | 10 +---------
 tests/test_5_synonym_malefemale_replacement.py  |  9 ---------
 tests/test_6_lemmatizer.py                      |  9 ---------
 tests/test_7_stopwords.py                       |  9 ---------
 .../preprocessing/vectorization_tokenization.py |  1 -
 9 files changed, 10 insertions(+), 46 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index fb6bf02..4572491 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -24,6 +24,6 @@ coverage==6.4.4
 # Has to be installed last / optionnal to use spacy lemmatizer
 markupsafe==2.0.1 # BUG FIX -> https://github.com/aws/aws-sam-cli/issues/3661
 Cython==0.29.24
-spacy==3.3.1
+spacy==3.3.3
 # The following line downloads a french spacy model. It can be commented if you don't have an internet access to download it, but lemmatizer features won't work.
 https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.3.0/fr_core_news_sm-3.3.0-py3-none-any.whl
diff --git a/setup.py b/setup.py
index 16316f0..b3d282f 100644
--- a/setup.py
+++ b/setup.py
@@ -60,7 +60,7 @@
         'requests>=2.23,<2.29',
     ],
     extras_require={
-        "lemmatizer": ["spacy==3.3.1", "markupsafe==2.0.1", "Cython==0.29.24", "fr-core-news-sm==3.3.0"]
+        "lemmatizer": ["spacy==3.3.3", "markupsafe==2.0.1", "Cython==0.29.24", "fr-core-news-sm==3.3.0"]
     } # pip install words_n_fun || pip install words_n_fun[lemmatizer]
 )
diff --git a/tests/test_1_utils.py b/tests/test_1_utils.py
index 95848ea..bb348e3 100644
--- a/tests/test_1_utils.py
+++ b/tests/test_1_utils.py
@@ -94,7 +94,7 @@ def test_data_agnostic(self):
         '''Testing function utils.data_agnostic'''
         # Definition d'une fonction à décorer
         def test_function(docs):
-            if type(docs) != pd.Series: raise TypeError('')
+            if not isinstance(docs, pd.Series): raise TypeError('')
             return docs.apply(lambda x: 'test')
         # Vals à tester
         test_str = "ceci est un test"
@@ -159,7 +159,7 @@ def test_data_agnostic_input(self):
         '''Testing function utils.data_agnostic_input'''
         # Definition d'une fonction à décorer
         def test_function(docs):
-            if type(docs) != pd.Series: raise TypeError('')
+            if not isinstance(docs, pd.Series): raise TypeError('')
             return docs.apply(lambda x: 'test')
         # Vals à tester
         test_str = "ceci est un test"
@@ -503,7 +503,7 @@ def test_regroup_data_series(self):
         '''Testing function utils.regroup_data_series'''
         # Definition d'une fonction à décorer
         def test_function(docs):
-            if type(docs) != pd.Series: raise TypeError('')
+            if not isinstance(docs, pd.Series): raise TypeError('')
             return docs.apply(lambda x: x if x in ['avant', 'milieu', 'après'] else 'test')
         # Vals à tester
         docs_test = pd.Series(['avant'] + ["ceci est un test"] * 5000 + ['milieu'] + ["ceci est un test"] * 5000 + ['après'], name='test')
@@ -534,11 +534,11 @@ def test_regroup_data_df(self):
         '''Testing function utils.regroup_data_df'''
         # Definition d'une fonction à wrapper
         def test_function_1(df):
-            if type(df) != pd.DataFrame: raise TypeError('')
+            if not isinstance(df, pd.DataFrame): raise TypeError('')
             df['test1'] = df['test1'].str.replace('toto', 'titi', regex=False)
             return df
         def test_function_2(df):
-            if type(df) != pd.DataFrame: raise TypeError('')
+            if not isinstance(df, pd.DataFrame): raise TypeError('')
             df['test3'] = df['test2'].str.replace('toto', 'tata', regex=False)
             return df
         # Vals à tester
diff --git a/tests/test_3_api.py b/tests/test_3_api.py
index 3bd93a2..e8739f4 100644
--- a/tests/test_3_api.py
+++ b/tests/test_3_api.py
@@ -70,11 +70,11 @@ def test_get_preprocessor(self):
     def test_preprocessor_pipeline_setter(self):
         preprocessor = api.get_preprocessor(pipeline=api.DEFAULT_PIPELINE)
         # test getter
-        self.assertEquals(preprocessor.pipeline, api.DEFAULT_PIPELINE)
+        self.assertEqual(preprocessor.pipeline, api.DEFAULT_PIPELINE)
         # modify pipeline
         alt_pipeline = api.DEFAULT_PIPELINE[:-2]
         preprocessor.pipeline = alt_pipeline
-        self.assertEquals(preprocessor.pipeline, alt_pipeline)
+        self.assertEqual(preprocessor.pipeline, alt_pipeline)

     def test_preprocess_pipeline(self):
         '''Testing function api.preprocess_pipeline'''
diff --git a/tests/test_4_vectorizationTokenization.py b/tests/test_4_vectorizationTokenization.py
index fd29cc9..bc7e12c 100644
--- a/tests/test_4_vectorizationTokenization.py
+++ b/tests/test_4_vectorizationTokenization.py
@@ -23,10 +23,8 @@
 # Utils libs
 import os
-import importlib
 import numpy as np
 import pandas as pd
-from words_n_fun import utils
 from words_n_fun.preprocessing import vectorization_tokenization

 # Disable logging
@@ -34,16 +32,10 @@
 logging.disable(logging.CRITICAL)

+
 class VectorizationTokenizationTests(unittest.TestCase):
     '''Main class to test all functions in vectorization_tokenization.py.'''

-    # Mock du decorateur DataAgnostic (on le bypass pour les tests)
-    default_decorator = lambda f: f
-    utils.data_agnostic = default_decorator
-    utils.data_agnostic_input = default_decorator
-    # Reload de la librairie vectorization_tokenization (pour application du decorateur par defaut)
-    importlib.reload(vectorization_tokenization)
-
     def setUp(self):
         '''SetUp fonction'''
diff --git a/tests/test_5_synonym_malefemale_replacement.py b/tests/test_5_synonym_malefemale_replacement.py
index 23ed3c5..a710aa2 100644
--- a/tests/test_5_synonym_malefemale_replacement.py
+++ b/tests/test_5_synonym_malefemale_replacement.py
@@ -23,10 +23,8 @@
 # Utils libs
 import os
-import importlib
 import numpy as np
 import pandas as pd
-from words_n_fun import utils
 from words_n_fun.preprocessing import synonym_malefemale_replacement

 # Disable logging
@@ -37,13 +35,6 @@ class SynonymTests(unittest.TestCase):
     '''Main class to test all functions in synonym_malefemale_replacement.py'''

-    # Mock du decorateur DataAgnostic (on le bypass pour les tests)
-    default_decorator = lambda f: f
-    utils.data_agnostic = default_decorator
-    utils.data_agnostic_input = default_decorator
-    # Reload de la librairie basic (pour application du decorateur par defaut)
-    importlib.reload(synonym_malefemale_replacement)
-
     def setUp(self):
         '''SetUp fonction'''
diff --git a/tests/test_6_lemmatizer.py b/tests/test_6_lemmatizer.py
index ec97f43..98fb874 100644
--- a/tests/test_6_lemmatizer.py
+++ b/tests/test_6_lemmatizer.py
@@ -27,10 +27,8 @@
     import spacy
 except ModuleNotFoundError:
     raise unittest.SkipTest("Skipping all lemmatizer tests as spacy can't be imported.")
-import importlib
 import numpy as np
 import pandas as pd
-from words_n_fun import utils
 from words_n_fun.preprocessing import lemmatizer

 # Disable logging
@@ -41,13 +39,6 @@ class LemmatizerTests(unittest.TestCase):
     '''Main class to test all functions in lemmatizer.py.'''

-    # Mock du decorateur DataAgnostic (on le bypass pour les tests)
-    default_decorator = lambda f: f
-    utils.data_agnostic = default_decorator
-    utils.data_agnostic_input = default_decorator
-    # Reload de la librairie lemmatizer (pour application du decorateur par defaut)
-    importlib.reload(lemmatizer)
-
     def setUp(self):
         '''SetUp fonction'''
diff --git a/tests/test_7_stopwords.py b/tests/test_7_stopwords.py
index 3eec321..390ecd7 100644
--- a/tests/test_7_stopwords.py
+++ b/tests/test_7_stopwords.py
@@ -23,10 +23,8 @@
 # Utils libs
 import os
-import importlib
 import numpy as np
 import pandas as pd
-from words_n_fun import utils
 from words_n_fun.preprocessing import stopwords

 # Disable logging
@@ -37,13 +35,6 @@ class StopwordsTests(unittest.TestCase):
     '''Main class to test all functions in stopwords.py.'''

-    # Mock du decorateur DataAgnostic (on le bypass pour les tests)
-    default_decorator = lambda f: f
-    utils.data_agnostic = default_decorator
-    utils.data_agnostic_input = default_decorator
-    # Reload de la librairie stopwords (pour application du decorateur par defaut)
-    importlib.reload(stopwords)
-
     def setUp(self):
         '''SetUp fonction'''
diff --git a/words_n_fun/preprocessing/vectorization_tokenization.py b/words_n_fun/preprocessing/vectorization_tokenization.py
index cfe9f4b..4f7437d 100644
--- a/words_n_fun/preprocessing/vectorization_tokenization.py
+++ b/words_n_fun/preprocessing/vectorization_tokenization.py
@@ -30,7 +30,6 @@
 logger = logging.getLogger(__name__)


-@utils.data_agnostic
 def split_text_into_tokens(docs: pd.Series, nbech: int = 10, seq_size: int = 3, step: int = 1, granularity: str = "word") -> Tuple[pd.Series, pd.Series]:
     '''Split an input text into seq_size tokens (word or char) with at most nbech tokens
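
Outside the patch itself, two fixes recur throughout the test files: exact type comparisons such as `type(docs) != pd.Series` are replaced by `isinstance` checks, which also accept subclasses, and the deprecated `assertEquals` alias (deprecated since Python 3.2, removed in Python 3.12) is replaced by `assertEqual`. Below is a minimal standalone sketch of both patterns, assuming only `unittest` and `pandas`; the class and test names are hypothetical and not part of words_n_fun.

```python
import unittest

import pandas as pd


class FixedPatternsSketch(unittest.TestCase):
    def test_isinstance_check(self):
        # isinstance() is True for pd.Series and any subclass,
        # whereas `type(docs) != pd.Series` would wrongly reject subclasses.
        docs = pd.Series(["ceci est un test"])
        if not isinstance(docs, pd.Series):
            raise TypeError('')
        self.assertIsInstance(docs, pd.Series)

    def test_assert_equal(self):
        # assertEqual is the supported spelling; assertEquals is a
        # deprecated alias removed in Python 3.12.
        self.assertEqual("toto".replace("toto", "titi"), "titi")


if __name__ == "__main__":
    unittest.main()
```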