From 2dd3d4f60ef3233215585167821a3ae3d181d875 Mon Sep 17 00:00:00 2001 From: Ivan Golikov Date: Mon, 2 Sep 2024 13:09:31 +0200 Subject: [PATCH] CSS selectors calculation done separately for each element (#198) * Calculating CSS selectors one-by-one in a shared Selenium session * CSS selectors caching decorator fix * Extending shared Selenoid session idle timeout Doing that to prevent session from being destroyed in-between tasks execution * Locators calculation speedup --- app/css_selectors/__init__.py | 4 +- app/css_selectors/tasks.py | 77 +++++++++++++++-------------------- app/css_selectors/utils.py | 8 ++++ app/selenium_app.py | 19 +++++---- app/tasks.py | 2 +- utils/api_utils.py | 36 +++++++--------- 6 files changed, 67 insertions(+), 79 deletions(-) diff --git a/app/css_selectors/__init__.py b/app/css_selectors/__init__.py index 3e60a991..5161af98 100644 --- a/app/css_selectors/__init__.py +++ b/app/css_selectors/__init__.py @@ -1,8 +1,8 @@ __all__ = [ "CSS_SELECTOR_GEN_TASK_PREFIX", - "task_schedule_css_selectors_generation", + "task_schedule_css_selector_generation", "inject_css_selector_generator_scripts", ] -from .tasks import task_schedule_css_selectors_generation, CSS_SELECTOR_GEN_TASK_PREFIX +from .tasks import task_schedule_css_selector_generation, CSS_SELECTOR_GEN_TASK_PREFIX from .utils import inject_css_selector_generator_scripts diff --git a/app/css_selectors/tasks.py b/app/css_selectors/tasks.py index 87ae3f30..d70ee997 100644 --- a/app/css_selectors/tasks.py +++ b/app/css_selectors/tasks.py @@ -1,11 +1,11 @@ import logging -# from pathlib import Path from typing import List, Dict from app.celery_app import celery_app -from app.selenium_app import get_webdriver, inject_html from app.redis_app import redis_app +from .utils import ExistingRemoteSession + logger = logging.getLogger(__name__) @@ -26,30 +26,16 @@ def wrapper(*args, **kwargs): def _cache_calculations_results(func): def wrapper(*args, **kwargs): - elements_ids = kwargs["elements_ids"] - redis_keys = [f"css-selector-{e}" for e in elements_ids] - cached_selectors = redis_app.mget(redis_keys) - - result = [] - not_cached_elements_ids = [] - - for element_id, selector in zip(elements_ids, cached_selectors): - if selector is None: - not_cached_elements_ids.append(element_id) - else: - logger.info(f"Using cached selector for element {element_id}") - result.append({"id": element_id, "result": selector.decode("utf-8")}) - - if not_cached_elements_ids: - new_kwargs = kwargs.copy() - new_kwargs["elements_ids"] = not_cached_elements_ids - - new_results = func(*args, **new_kwargs) - - result.extend(new_results) - - for new_result in new_results: - redis_app.set(f"css-selector-{new_result['id']}", new_result["result"], ex=60*60*24) + element_id = kwargs["element_id"] + redis_key = f"css-selector-{element_id}" + cached_selector = redis_app.get(redis_key) + + if cached_selector: + logger.info(f"Using cached selector for element {element_id}") + result = [{"id": element_id, "result": cached_selector.decode("utf-8")}] + else: + result = func(*args, **kwargs) + redis_app.set(redis_key, result[0]["result"], ex=60*60*24) return result return wrapper @@ -58,22 +44,23 @@ def wrapper(*args, **kwargs): @celery_app.task(bind=True) @_replace_error_messages("Error generating CSS selectors") @_cache_calculations_results -def task_schedule_css_selectors_generation( - self, document_key: str, elements_ids: List[str] -) -> List[Dict[str, str]]: - driver = get_webdriver() - inject_html(driver, redis_app.get(document_key).decode("utf-8")) - - result = [] - for element_id in elements_ids: - result.append({ - "id": element_id, - "result": driver.execute_script( - f""" - el = document.querySelector('[jdn-hash="{element_id}"]'); - return generateSelectorByElement(el); - """ - ), - }) - - return result +def task_schedule_css_selector_generation(self, session_id: str, element_id: str) -> List[Dict[str, str]]: + """Get CSS selector for element using passed Selenium session. + + :param session_id: Selenium session id + :param element_id: Value of jdn-hash attribute of element for which the CSS selector should be generated + + :returns: List with result dictionary. List is used just to keep compatibility with old API. + """ + driver = ExistingRemoteSession(command_executor="http://selenoid:4444/wd/hub", desired_capabilities=None) + driver.session_id = session_id + + return [{ + "id": element_id, + "result": driver.execute_script( + f""" + el = document.querySelector('[jdn-hash="{element_id}"]'); + return generateSelectorByElement(el); + """ + ), + }] diff --git a/app/css_selectors/utils.py b/app/css_selectors/utils.py index a8dbaf73..141b40a9 100644 --- a/app/css_selectors/utils.py +++ b/app/css_selectors/utils.py @@ -1,4 +1,5 @@ from bs4 import BeautifulSoup +from selenium import webdriver def get_script_text(script_path: str) -> str: @@ -23,3 +24,10 @@ def inject_css_selector_generator_scripts(document: str) -> str: doc_soup.head.append(script_tag) return str(doc_soup) + + +class ExistingRemoteSession(webdriver.Remote): + """Dummy remote webdriver class that don't start new Selenium session""" + def start_session(self, capabilities, browser_profile=None): + # Skip the NEW_SESSION command issued by the original driver + pass diff --git a/app/selenium_app.py b/app/selenium_app.py index 410f3b66..3174a11f 100644 --- a/app/selenium_app.py +++ b/app/selenium_app.py @@ -12,15 +12,12 @@ from utils import config -def get_webdriver() -> webdriver.Remote: +def get_webdriver(extra_capabilities: dict = None) -> webdriver.Remote: """Returns a remote Chrome webdriver instance""" chrome_options = Options() chrome_options.add_argument("--no-sandbox") chrome_options.add_argument("--headless") - if config.IS_DEV_SHM_USAGE_DISABLED: - chrome_options.add_argument("--disable-dev-shm-usage") - capabilities = { "browserName": "chrome", "browserVersion": "118.0", @@ -28,12 +25,16 @@ def get_webdriver() -> webdriver.Remote: "enableVideo": False } } + if extra_capabilities: + capabilities.update(extra_capabilities) - return webdriver.Remote( - command_executor="http://selenoid:4444/wd/hub", - desired_capabilities=capabilities, - options=chrome_options, - ) + for name, value in capabilities.items(): + chrome_options.set_capability(name, value) + + if config.IS_DEV_SHM_USAGE_DISABLED: + chrome_options.add_argument("--disable-dev-shm-usage") + + return webdriver.Remote(command_executor="http://selenoid:4444/wd/hub", options=chrome_options) def inject_html(driver: webdriver.Remote, html: str) -> None: diff --git a/app/tasks.py b/app/tasks.py index f7195c5d..4388fa6f 100644 --- a/app/tasks.py +++ b/app/tasks.py @@ -8,7 +8,7 @@ from app.models import ReportMail, RobulaSettingsModel from app.redis_app import redis_app from utils.robula import generate_xpath -from .css_selectors import task_schedule_css_selectors_generation # noqa: F401 +from .css_selectors import task_schedule_css_selector_generation # noqa: F401 ENV = os.getenv("ENV") diff --git a/utils/api_utils.py b/utils/api_utils.py index 10f43493..408b517d 100644 --- a/utils/api_utils.py +++ b/utils/api_utils.py @@ -14,10 +14,8 @@ from app.logger import logger from app.models import LoggingInfoModel, TaskStatusModel, XPathGenerationModel, CSSSelectorGenerationModel from app.redis_app import redis_app -from app.selenium_app import get_chunks_boundaries -from app.tasks import ENV, task_schedule_xpath_generation, task_schedule_css_selectors_generation - -from utils import config as app_config +from app.selenium_app import get_webdriver, inject_html +from app.tasks import ENV, task_schedule_xpath_generation, task_schedule_css_selector_generation def get_task_status(task_id) -> TaskStatusModel: @@ -335,31 +333,24 @@ async def process_incoming_ws_request( generation_data = CSSSelectorGenerationModel(**payload) elements_ids = generation_data.id - document = generation_data.document - random_document_key = str(uuid.uuid4()) - redis_app.set(name=random_document_key, value=inject_css_selector_generator_scripts(document), ex=120) - selectors_generation_results = [] + # Start Selenium session that will be used by Celery workers to generate CSS selectors + # Changing default idle timeout to prevent session from being destroyed by Selenoid + document = inject_css_selector_generator_scripts(generation_data.document) + driver = get_webdriver(extra_capabilities={"sessionTimeout": "30m"}) + inject_html(driver, document) - num_of_tasks = app_config.SELENOID_PARALLEL_SESSIONS_COUNT - jobs_chunks = get_chunks_boundaries(elements_ids, num_of_tasks) - - for start_idx, end_idx in jobs_chunks: - # Due to the implementation of get_chunks_boundaries we can get - # several empty chunks and one chunk with all elements in case when - # len(elements_ids) < num_of_tasks - # We can skip them to avoid sending of basically empty tasks to Celery - if start_idx == end_idx: - continue + selectors_generation_results = [] + for element_id in elements_ids: task_id = convert_task_id_if_exists( - f"{CSS_SELECTOR_GEN_TASK_PREFIX}{uuid.uuid4()}" + f"{CSS_SELECTOR_GEN_TASK_PREFIX}{element_id}" ) task_kwargs = { - "document_key": random_document_key, - "elements_ids": elements_ids[start_idx:end_idx], + "session_id": driver.session_id, + "element_id": element_id, } - task_result_obj = task_schedule_css_selectors_generation.apply_async( + task_result_obj = task_schedule_css_selector_generation.apply_async( kwargs=task_kwargs, task_id=task_id, zpriority=2 ) selectors_generation_results.append(task_result_obj) @@ -377,6 +368,7 @@ async def process_incoming_ws_request( ) ) await asyncio.wait(celery_waiting_tasks) + driver.quit() return result