CSS selectors calculation done separately for each element (#198)
* Calculating CSS selectors one-by-one in a shared Selenium session

* CSS selectors caching decorator fix

* Extending shared Selenoid session idle timeout

This is done to prevent the session from being destroyed between task executions

* Locators calculation speedup
ivnglkv authored Sep 2, 2024
1 parent 8aa85fd commit 2dd3d4f
Showing 6 changed files with 67 additions and 79 deletions.
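Taken together, the changes in this commit replace chunked CSS-selector tasks with one Celery task per element, all sharing a single long-lived Selenoid session. The sketch below is a condensed illustration of that flow, not code from the repository: it reuses the helpers named in the diff (get_webdriver, inject_html, task_schedule_css_selector_generation) and collapses the asyncio-based waiting in utils/api_utils.py into plain blocking .get() calls.

```python
from typing import Dict, List

from app.css_selectors import task_schedule_css_selector_generation
from app.selenium_app import get_webdriver, inject_html


def generate_selectors(document_html: str, element_ids: List[str]) -> List[Dict[str, str]]:
    # Raise the Selenoid idle timeout so the shared session survives between tasks.
    driver = get_webdriver(extra_capabilities={"sessionTimeout": "30m"})
    inject_html(driver, document_html)
    try:
        async_results = [
            task_schedule_css_selector_generation.apply_async(
                kwargs={"session_id": driver.session_id, "element_id": element_id}
            )
            for element_id in element_ids
        ]
        # Each task returns a one-item list: [{"id": <jdn-hash>, "result": <selector>}]
        return [item for result in async_results for item in result.get()]
    finally:
        driver.quit()
```

In the real handler the per-element results are awaited asynchronously over the websocket, and the shared session is closed with driver.quit() only after every task has finished.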
app/css_selectors/__init__.py (4 changes: 2 additions & 2 deletions)
@@ -1,8 +1,8 @@
__all__ = [
"CSS_SELECTOR_GEN_TASK_PREFIX",
"task_schedule_css_selectors_generation",
"task_schedule_css_selector_generation",
"inject_css_selector_generator_scripts",
]

from .tasks import task_schedule_css_selectors_generation, CSS_SELECTOR_GEN_TASK_PREFIX
from .tasks import task_schedule_css_selector_generation, CSS_SELECTOR_GEN_TASK_PREFIX
from .utils import inject_css_selector_generator_scripts
app/css_selectors/tasks.py (77 changes: 32 additions & 45 deletions)
@@ -1,11 +1,11 @@
import logging
# from pathlib import Path
from typing import List, Dict

from app.celery_app import celery_app
from app.selenium_app import get_webdriver, inject_html
from app.redis_app import redis_app

from .utils import ExistingRemoteSession


logger = logging.getLogger(__name__)

@@ -26,30 +26,16 @@ def wrapper(*args, **kwargs):

def _cache_calculations_results(func):
def wrapper(*args, **kwargs):
elements_ids = kwargs["elements_ids"]
redis_keys = [f"css-selector-{e}" for e in elements_ids]
cached_selectors = redis_app.mget(redis_keys)

result = []
not_cached_elements_ids = []

for element_id, selector in zip(elements_ids, cached_selectors):
if selector is None:
not_cached_elements_ids.append(element_id)
else:
logger.info(f"Using cached selector for element {element_id}")
result.append({"id": element_id, "result": selector.decode("utf-8")})

if not_cached_elements_ids:
new_kwargs = kwargs.copy()
new_kwargs["elements_ids"] = not_cached_elements_ids

new_results = func(*args, **new_kwargs)

result.extend(new_results)

for new_result in new_results:
redis_app.set(f"css-selector-{new_result['id']}", new_result["result"], ex=60*60*24)
element_id = kwargs["element_id"]
redis_key = f"css-selector-{element_id}"
cached_selector = redis_app.get(redis_key)

if cached_selector:
logger.info(f"Using cached selector for element {element_id}")
result = [{"id": element_id, "result": cached_selector.decode("utf-8")}]
else:
result = func(*args, **kwargs)
redis_app.set(redis_key, result[0]["result"], ex=60*60*24)
return result

return wrapper
@@ -58,22 +44,23 @@ def wrapper(*args, **kwargs):
@celery_app.task(bind=True)
@_replace_error_messages("Error generating CSS selectors")
@_cache_calculations_results
def task_schedule_css_selectors_generation(
self, document_key: str, elements_ids: List[str]
) -> List[Dict[str, str]]:
driver = get_webdriver()
inject_html(driver, redis_app.get(document_key).decode("utf-8"))

result = []
for element_id in elements_ids:
result.append({
"id": element_id,
"result": driver.execute_script(
f"""
el = document.querySelector('[jdn-hash="{element_id}"]');
return generateSelectorByElement(el);
"""
),
})

return result
def task_schedule_css_selector_generation(self, session_id: str, element_id: str) -> List[Dict[str, str]]:
"""Get CSS selector for element using passed Selenium session.
:param session_id: Selenium session id
:param element_id: Value of jdn-hash attribute of element for which the CSS selector should be generated
:returns: List with result dictionary. List is used just to keep compatibility with old API.
"""
driver = ExistingRemoteSession(command_executor="http://selenoid:4444/wd/hub", desired_capabilities=None)
driver.session_id = session_id

return [{
"id": element_id,
"result": driver.execute_script(
f"""
el = document.querySelector('[jdn-hash="{element_id}"]');
return generateSelectorByElement(el);
"""
),
}]
app/css_selectors/utils.py (8 changes: 8 additions & 0 deletions)
@@ -1,4 +1,5 @@
from bs4 import BeautifulSoup
from selenium import webdriver


def get_script_text(script_path: str) -> str:
@@ -23,3 +24,10 @@ def inject_css_selector_generator_scripts(document: str) -> str:
doc_soup.head.append(script_tag)

return str(doc_soup)


class ExistingRemoteSession(webdriver.Remote):
"""Dummy remote webdriver class that don't start new Selenium session"""
def start_session(self, capabilities, browser_profile=None):
# Skip the NEW_SESSION command issued by the original driver
pass
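For context, here is a minimal sketch of how a worker can attach to the already-running browser through this class. The hub URL mirrors the one used elsewhere in this commit, and the session id would normally arrive via the Celery task arguments.

```python
from app.css_selectors.utils import ExistingRemoteSession


def attach_to_shared_session(session_id: str) -> ExistingRemoteSession:
    # start_session() is a no-op here, so no new browser session is negotiated.
    driver = ExistingRemoteSession(
        command_executor="http://selenoid:4444/wd/hub",
        desired_capabilities=None,
    )
    # Point the driver at the session created by the API layer.
    driver.session_id = session_id
    return driver
```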
app/selenium_app.py (19 changes: 10 additions & 9 deletions)
@@ -12,28 +12,29 @@
from utils import config


def get_webdriver() -> webdriver.Remote:
def get_webdriver(extra_capabilities: dict = None) -> webdriver.Remote:
"""Returns a remote Chrome webdriver instance"""
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--headless")

if config.IS_DEV_SHM_USAGE_DISABLED:
chrome_options.add_argument("--disable-dev-shm-usage")

capabilities = {
"browserName": "chrome",
"browserVersion": "118.0",
"selenoid:options": {
"enableVideo": False
}
}
if extra_capabilities:
capabilities.update(extra_capabilities)

return webdriver.Remote(
command_executor="http://selenoid:4444/wd/hub",
desired_capabilities=capabilities,
options=chrome_options,
)
for name, value in capabilities.items():
chrome_options.set_capability(name, value)

if config.IS_DEV_SHM_USAGE_DISABLED:
chrome_options.add_argument("--disable-dev-shm-usage")

return webdriver.Remote(command_executor="http://selenoid:4444/wd/hub", options=chrome_options)


def inject_html(driver: webdriver.Remote, html: str) -> None:
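For reference, a small usage sketch of the updated get_webdriver helper shown above; the sessionTimeout value mirrors the one passed in utils/api_utils.py below, and every extra capability is applied to ChromeOptions via set_capability().

```python
from app.selenium_app import get_webdriver

driver = get_webdriver(extra_capabilities={"sessionTimeout": "30m"})
try:
    print(driver.session_id)  # the id that per-element Celery tasks re-attach to
finally:
    driver.quit()
```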
app/tasks.py (2 changes: 1 addition & 1 deletion)
@@ -8,7 +8,7 @@
from app.models import ReportMail, RobulaSettingsModel
from app.redis_app import redis_app
from utils.robula import generate_xpath
from .css_selectors import task_schedule_css_selectors_generation # noqa: F401
from .css_selectors import task_schedule_css_selector_generation # noqa: F401

ENV = os.getenv("ENV")

utils/api_utils.py (36 changes: 14 additions & 22 deletions)
@@ -14,10 +14,8 @@
from app.logger import logger
from app.models import LoggingInfoModel, TaskStatusModel, XPathGenerationModel, CSSSelectorGenerationModel
from app.redis_app import redis_app
from app.selenium_app import get_chunks_boundaries
from app.tasks import ENV, task_schedule_xpath_generation, task_schedule_css_selectors_generation

from utils import config as app_config
from app.selenium_app import get_webdriver, inject_html
from app.tasks import ENV, task_schedule_xpath_generation, task_schedule_css_selector_generation


def get_task_status(task_id) -> TaskStatusModel:
@@ -335,31 +333,24 @@ async def process_incoming_ws_request(
generation_data = CSSSelectorGenerationModel(**payload)
elements_ids = generation_data.id

document = generation_data.document
random_document_key = str(uuid.uuid4())
redis_app.set(name=random_document_key, value=inject_css_selector_generator_scripts(document), ex=120)
selectors_generation_results = []
# Start Selenium session that will be used by Celery workers to generate CSS selectors
# Changing default idle timeout to prevent session from being destroyed by Selenoid
document = inject_css_selector_generator_scripts(generation_data.document)
driver = get_webdriver(extra_capabilities={"sessionTimeout": "30m"})
inject_html(driver, document)

num_of_tasks = app_config.SELENOID_PARALLEL_SESSIONS_COUNT
jobs_chunks = get_chunks_boundaries(elements_ids, num_of_tasks)

for start_idx, end_idx in jobs_chunks:
# Due to the implementation of get_chunks_boundaries we can get
# several empty chunks and one chunk with all elements in case when
# len(elements_ids) < num_of_tasks
# We can skip them to avoid sending of basically empty tasks to Celery
if start_idx == end_idx:
continue
selectors_generation_results = []

for element_id in elements_ids:
task_id = convert_task_id_if_exists(
f"{CSS_SELECTOR_GEN_TASK_PREFIX}{uuid.uuid4()}"
f"{CSS_SELECTOR_GEN_TASK_PREFIX}{element_id}"
)
task_kwargs = {
"document_key": random_document_key,
"elements_ids": elements_ids[start_idx:end_idx],
"session_id": driver.session_id,
"element_id": element_id,
}

task_result_obj = task_schedule_css_selectors_generation.apply_async(
task_result_obj = task_schedule_css_selector_generation.apply_async(
kwargs=task_kwargs, task_id=task_id, zpriority=2
)
selectors_generation_results.append(task_result_obj)
@@ -377,6 +368,7 @@ async def process_incoming_ws_request(
)
)
await asyncio.wait(celery_waiting_tasks)
driver.quit()

return result
