From c4811ae63ffceae86f4e0abe53c4ba045d668270 Mon Sep 17 00:00:00 2001 From: JJMerchante Date: Mon, 8 Mar 2021 17:57:47 +0100 Subject: [PATCH] [stackexchange] Make author anonymization optional for stackexchange This code makes it optional to pseudo-anonymize the owners of questions and answers. It creates a hash of the name and user_id and removes author_link and profile_image. To enable the anonymization, include the following parameter in the stackexchange section: ``` [stackexchange] ... anonymize = true ``` Signed-off-by: JJMerchante --- grimoire_elk/identities/git.py | 2 +- grimoire_elk/identities/github.py | 2 +- grimoire_elk/identities/identities.py | 2 +- grimoire_elk/identities/meetup.py | 2 +- grimoire_elk/identities/stackexchange.py | 54 ++++++++++++++++++++++++ grimoire_elk/raw/stackexchange.py | 2 + tests/test_stackexchange.py | 37 ++++++++++++++++ 7 files changed, 97 insertions(+), 4 deletions(-) create mode 100644 grimoire_elk/identities/stackexchange.py diff --git a/grimoire_elk/identities/git.py b/grimoire_elk/identities/git.py index 67d9fdaa3..b2816ef4c 100644 --- a/grimoire_elk/identities/git.py +++ b/grimoire_elk/identities/git.py @@ -16,7 +16,7 @@ # along with this program. If not, see . # # Authors: -# Jose Javier Merchante Picazo +# Jose Javier Merchante Picazo # from grimoire_elk.identities.identities import Identities diff --git a/grimoire_elk/identities/github.py b/grimoire_elk/identities/github.py index 390914ba5..94bac052e 100644 --- a/grimoire_elk/identities/github.py +++ b/grimoire_elk/identities/github.py @@ -16,7 +16,7 @@ # along with this program. If not, see . 
# # Authors: -# Jose Javier Merchante Picazo +# Jose Javier Merchante Picazo # from grimoire_elk.identities.identities import Identities diff --git a/grimoire_elk/identities/identities.py b/grimoire_elk/identities/identities.py index 06a41994e..317e4915e 100644 --- a/grimoire_elk/identities/identities.py +++ b/grimoire_elk/identities/identities.py @@ -16,7 +16,7 @@ # along with this program. If not, see . # # Authors: -# Jose Javier Merchante Picazo +# Jose Javier Merchante Picazo # import hashlib diff --git a/grimoire_elk/identities/meetup.py b/grimoire_elk/identities/meetup.py index 3f9a00c2b..d2e86a453 100644 --- a/grimoire_elk/identities/meetup.py +++ b/grimoire_elk/identities/meetup.py @@ -16,7 +16,7 @@ # along with this program. If not, see . # # Authors: -# Jose Javier Merchante Picazo +# Jose Javier Merchante Picazo # from grimoire_elk.identities.identities import Identities diff --git a/grimoire_elk/identities/stackexchange.py b/grimoire_elk/identities/stackexchange.py new file mode 100644 index 000000000..f581c2196 --- /dev/null +++ b/grimoire_elk/identities/stackexchange.py @@ -0,0 +1,54 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2015-2020 Bitergia +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+# +# Authors: +# Jose Javier Merchante Picazo +# + +from grimoire_elk.identities.identities import Identities + + +class StackExchangeIdentities(Identities): + @classmethod + def anonymize_item(cls, item): + """Remove or hash the fields that contain personal information + + Comments are removed because they can add complexity: there can be many, + and they are not used in the enrichment process. + """ + + item = item['data'] + + item['comments'] = [] + if 'owner' in item and item['owner']: + cls._sanitize_owner(item['owner']) + + if 'answers' in item and item['answers']: + for answer in item['answers']: + if 'owner' in answer and answer['owner']: + cls._sanitize_owner(answer['owner']) + answer['comments'] = [] + + @classmethod + def _sanitize_owner(cls, owner): + """Blank link and profile_image; hash display_name and user_id""" + if 'display_name' in owner: + owner['display_name'] = cls._hash(owner['display_name']) + if 'user_id' in owner: + owner['user_id'] = cls._hash(str(owner['user_id'])) + owner['profile_image'] = '' + owner['link'] = '' diff --git a/grimoire_elk/raw/stackexchange.py b/grimoire_elk/raw/stackexchange.py index 93fcaeca3..17bc5ba8e 100644 --- a/grimoire_elk/raw/stackexchange.py +++ b/grimoire_elk/raw/stackexchange.py @@ -21,6 +21,7 @@ from .elastic import ElasticOcean from ..elastic_mapping import Mapping as BaseMapping +from ..identities.stackexchange import StackExchangeIdentities class Mapping(BaseMapping): @@ -64,6 +65,7 @@ class StackExchangeOcean(ElasticOcean): """StackExchange Ocean feeder""" mapping = Mapping + identities = StackExchangeIdentities @classmethod def get_perceval_params_from_url(cls, url): diff --git a/tests/test_stackexchange.py b/tests/test_stackexchange.py index 0f89b50d2..06a8c8505 100644 --- a/tests/test_stackexchange.py +++ b/tests/test_stackexchange.py @@ -32,7 +32,9 @@ class TestStackexchange(TestBaseBackend): connector = "stackexchange" ocean_index = "test_" + connector + ocean_index_anonymized = "test_" + connector + "_anonymized" 
enrich_index = "test_" + connector + "_enrich" + enrich_index_anonymized = "test_" + connector + "_enrich_anonymized" def test_has_identites(self): """Test value of has_identities method""" @@ -129,6 +131,41 @@ def test_copy_raw_fields(self): else: self.assertIsNone(eitem[attribute]) + def test_items_to_raw_anonymized(self): + """Test whether JSON items are anonymized and properly inserted into ES""" + + result = self._test_items_to_raw_anonymized() + + self.assertEqual(result['items'], 3) + self.assertEqual(result['raw'], 3) + + item = self.items[0]['data'] + self.assertEqual(item['owner']['display_name'], '80490d00f668dde48d4e0ce62142c8a2ac9a1465') + self.assertEqual(item['owner']['user_id'], '182b39d390fc9fde7594184cbe6e6f8653cfd5b2') + self.assertEqual(item['owner']['link'], '') + self.assertEqual(item['owner']['profile_image'], '') + self.assertEqual(len(item['comments']), 0) + self.assertEqual(item['answers'][0]['owner']['display_name'], '0d2244465bfc8b636bf1fbe74912cc2c748b42e4') + self.assertEqual(item['answers'][0]['owner']['user_id'], 'c7b7c5dea6f6a1a4531bf491b207d123ca41da4c') + self.assertEqual(item['answers'][0]['owner']['link'], '') + self.assertEqual(item['answers'][0]['owner']['profile_image'], '') + self.assertEqual(len(item['answers'][0]['comments']), 0) + + def test_raw_to_enrich_anonymized(self): + """Test whether the anonymized raw index is properly enriched""" + + result = self._test_raw_to_enrich_anonymized() + + self.assertEqual(result['raw'], 3) + self.assertEqual(result['enrich'], 6) + + enrich_backend = self.connectors[self.connector][2]() + + item = self.items[0] + eitem = enrich_backend.get_rich_item(item) + self.assertEqual(eitem['author'], '80490d00f668dde48d4e0ce62142c8a2ac9a1465') + self.assertEqual(eitem['author_link'], '') + if __name__ == "__main__": logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')