diff --git a/crawler-engine/src/main/java/by/andd3dfx/crawler/dto/SingleSearchResult.java b/crawler-engine/src/main/java/by/andd3dfx/crawler/dto/SingleSearchResult.java
index 7359537..7b0f121 100644
--- a/crawler-engine/src/main/java/by/andd3dfx/crawler/dto/SingleSearchResult.java
+++ b/crawler-engine/src/main/java/by/andd3dfx/crawler/dto/SingleSearchResult.java
@@ -2,5 +2,12 @@
 
 import java.util.List;
 
+/**
+ * DTO to store results of single search
+ *
+ * @param dataItems list of data items with type T
+ * @param nextPageUrl URL of next page
+ * @param <T> data items type
+ */
 public record SingleSearchResult<T>(List<T> dataItems, String nextPageUrl) {
 }
diff --git a/crawler-engine/src/main/java/by/andd3dfx/crawler/engine/WebCrawler.java b/crawler-engine/src/main/java/by/andd3dfx/crawler/engine/WebCrawler.java
index f0f394d..50391e1 100644
--- a/crawler-engine/src/main/java/by/andd3dfx/crawler/engine/WebCrawler.java
+++ b/crawler-engine/src/main/java/by/andd3dfx/crawler/engine/WebCrawler.java
@@ -16,7 +16,7 @@
  * Web crawler for retrieving list of data with type T by consequent visiting of pages.
  * Process started using provided starting page URL; next link on each page retrieved from current page.
  *
- * @param <T> data type
+ * @param <T> data items type
  */
 @Slf4j
 public abstract class WebCrawler<T> {
@@ -25,6 +25,53 @@ public abstract class WebCrawler<T> {
 
     private static final int DEFAULT_MAX_PAGES_CAP = 10;
     private static final long DEFAULT_THROTTLING_DELAY_MS = 20;
 
+    /**
+     * Search and extract data from page with provided URL
+     *
+     * @param pageUrl URL of page
+     * @return search result
+     */
+    @SneakyThrows
+    public SingleSearchResult<T> singleSearch(String pageUrl) {
+        Document document = Jsoup
+                .connect(pageUrl)
+                .userAgent(USER_AGENT).get();
+
+        Elements elements = extractElements(document);
+
+        List<T> dataItems = elements.stream()
+                .map(this::mapElementToData)
+                .toList();
+        log.debug("Single search: url={}, items={}", pageUrl, dataItems.size());
+
+        String nextUrl = extractNextUrl(document);
+        return new SingleSearchResult<>(dataItems, nextUrl);
+    }
+
+    /**
+     * Extract elements from parsed Jsoup document
+     *
+     * @param document Jsoup document
+     * @return extracted elements
+     */
+    protected abstract Elements extractElements(Document document);
+
+    /**
+     * Extract next URL from parsed Jsoup document
+     *
+     * @param document Jsoup document
+     * @return next URL
+     */
+    protected abstract String extractNextUrl(Document document);
+
+    /**
+     * Map element to result DTO object
+     *
+     * @param element Jsoup element to map
+     * @return DTO object of type T
+     */
+    protected abstract T mapElementToData(Element element);
+
     /**
      * Batch search using provided starting page URL, max pages cap 10 and throttling delay 20ms
      *
@@ -79,33 +126,4 @@ public List<T> batchSearch(String pageUrl, int maxPagesCap, long throttlingDelay) {
 
         return result;
     }
-
-    /**
-     * Search and extract data from page with provided URL
-     *
-     * @param pageUrl URL of page
-     * @return search result
-     */
-    @SneakyThrows
-    public SingleSearchResult<T> singleSearch(String pageUrl) {
-        Document document = Jsoup
-                .connect(pageUrl)
-                .userAgent(USER_AGENT).get();
-
-        Elements elements = extractElements(document);
-
-        List<T> dataItems = elements.stream()
-                .map(this::mapElementToData)
-                .toList();
-        log.debug("Single search: url={}, items={}", pageUrl, dataItems.size());
-
-        String nextUrl = extractNextUrl(document);
-        return new SingleSearchResult(dataItems, nextUrl);
-    }
-
-    protected abstract Elements extractElements(Document document);
-
-    protected abstract String extractNextUrl(Document document);
-
-    protected abstract T mapElementToData(Element element);
 }
diff --git a/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/MainApp.java b/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/MainApp.java
index 91a79c3..b02a71f 100644
--- a/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/MainApp.java
+++ b/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/MainApp.java
@@ -24,14 +24,14 @@ public static void main(String[] args) throws IOException {
         var searchItems = new ArrayList<BatchSearchResult>();
         for (var searchCriteria : fileUtil.loadSearchCriteria(paramsFileName)) {
             String startingUrl = searchCriteria.url();
-            String label = searchCriteria.topic();
+            String topic = searchCriteria.topic();
 
             var result = crawler.batchSearch(startingUrl, -1, 20)
                     .stream()
                     .filter(torrentData -> torrentData.getDownloadedCount() != null)
                     .sorted(Comparator.comparingInt(TorrentData::getDownloadedCount).reversed())
                     .toList();
-            searchItems.add(new BatchSearchResult(label, result));
+            searchItems.add(new BatchSearchResult(topic, result));
         }
 
         fileUtil.writeIntoExcel(excelFileName, searchItems);
diff --git a/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/dto/BatchSearchResult.java b/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/dto/BatchSearchResult.java
index 56c5e8e..2ddbe04 100644
--- a/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/dto/BatchSearchResult.java
+++ b/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/dto/BatchSearchResult.java
@@ -2,6 +2,12 @@
 
 import java.util.List;
 
+/**
+ * DTO to store results of batch search
+ *
+ * @param topic name of topic
+ * @param dataItems data items
+ */
 public record BatchSearchResult(String topic, List<TorrentData> dataItems) {
 }
 
diff --git a/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/dto/SearchCriteria.java b/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/dto/SearchCriteria.java
index d893162..9aa174f 100644
--- a/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/dto/SearchCriteria.java
+++ b/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/dto/SearchCriteria.java
@@ -1,5 +1,11 @@
 package by.andd3dfx.pravtor.dto;
 
+/**
+ * Search criteria
+ *
+ * @param topic name of topic
+ * @param url starting page URL
+ */
 public record SearchCriteria(String topic, String url) {
 }
 
diff --git a/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/util/FileUtil.java b/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/util/FileUtil.java
index 562a2ef..5a861c7 100644
--- a/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/util/FileUtil.java
+++ b/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/util/FileUtil.java
@@ -16,14 +16,14 @@
 import java.util.List;
 
 /**
- * Util to work with files
+ * Util to work with files of types: text & excel
 */
 public class FileUtil {
 
     public static final String[] HEADER_LABELS = {"Название", "Seeds", "Peers", "Скачано", "Размер", "Ссылка"};
 
     /**
-     * Load list of search criteria items from file
+     * Load list of search criteria items from plain text file
      *
      * @param fileName name of params file
     * @return list of search criteria items
diff --git a/rabota.by-crawler/src/main/java/by/andd3dfx/rabotaby/crawler/RabotaByWebCrawler.java b/rabota.by-crawler/src/main/java/by/andd3dfx/rabotaby/crawler/RabotaByWebCrawler.java
index 6201c4c..301bb3e 100644
--- a/rabota.by-crawler/src/main/java/by/andd3dfx/rabotaby/crawler/RabotaByWebCrawler.java
+++ b/rabota.by-crawler/src/main/java/by/andd3dfx/rabotaby/crawler/RabotaByWebCrawler.java
@@ -62,12 +62,6 @@ protected VacancyData mapElementToData(Element element) {
                 .build();
     }
 
-    private String extractSalary(Document document) {
-        return StringUtils.trimToNull(
-                document.select("span[data-qa=vacancy-salary-compensation-type-net]").text()
-        );
-    }
-
     private static String extractCompanyName(Document document) {
         var elements = document.select("a[data-qa=vacancy-company-name]");
         if (elements.isEmpty()) {
@@ -80,6 +74,12 @@ private static String extractTextContent(Document document) {
         return document.select("div[data-qa=vacancy-description]").text();
     }
 
+    private String extractSalary(Document document) {
+        return StringUtils.trimToNull(
+                document.select("span[data-qa=vacancy-salary-compensation-type-net]").text()
+        );
+    }
+
     private static Set<String> extractKeywords(Document document) {
         return document.select("li[data-qa=skills-element]")
                 .stream()