diff --git a/crawler-engine/src/main/java/by/andd3dfx/crawler/dto/SingleSearchResult.java b/crawler-engine/src/main/java/by/andd3dfx/crawler/dto/SingleSearchResult.java index 0d77b13..7359537 100644 --- a/crawler-engine/src/main/java/by/andd3dfx/crawler/dto/SingleSearchResult.java +++ b/crawler-engine/src/main/java/by/andd3dfx/crawler/dto/SingleSearchResult.java @@ -1,15 +1,6 @@ package by.andd3dfx.crawler.dto; -import lombok.Getter; -import lombok.RequiredArgsConstructor; - import java.util.List; -@Getter -@RequiredArgsConstructor -public class SingleSearchResult { - - private final List dataItems; - private final String prevPageUrl; - private final String nextPageUrl; +public record SingleSearchResult(List dataItems, String nextPageUrl) { } diff --git a/crawler-engine/src/main/java/by/andd3dfx/crawler/engine/WebCrawler.java b/crawler-engine/src/main/java/by/andd3dfx/crawler/engine/WebCrawler.java index d06be0e..516a28f 100644 --- a/crawler-engine/src/main/java/by/andd3dfx/crawler/engine/WebCrawler.java +++ b/crawler-engine/src/main/java/by/andd3dfx/crawler/engine/WebCrawler.java @@ -65,11 +65,11 @@ public List batchSearch(String pageUrl, int maxPagesCap, long throttlingDelay while (nextPage != null && (maxPagesCap == -1 || pagesCounter < maxPagesCap)) { SingleSearchResult searchResult = singleSearch(nextPage); - List dataItems = searchResult.getDataItems(); + List dataItems = searchResult.dataItems(); log.info("Hit №{}, {} items retrieved", pagesCounter, dataItems.size()); pagesCounter++; result.addAll(dataItems); - nextPage = searchResult.getNextPageUrl(); + nextPage = searchResult.nextPageUrl(); Thread.sleep(throttlingDelayMs); } @@ -97,17 +97,12 @@ public SingleSearchResult singleSearch(String pageUrl) { .toList(); log.debug("Single search: url={}, items={}", pageUrl, dataItems.size()); - String prevUrl = extractPrevUrl(document); String nextUrl = extractNextUrl(document); - return new SingleSearchResult(dataItems, prevUrl, nextUrl); + return new SingleSearchResult(dataItems, nextUrl); } protected abstract Elements extractElements(Document document); - protected String extractPrevUrl(Document document) { - return null; - } - protected abstract String extractNextUrl(Document document); protected abstract T mapElementToData(Element element); diff --git a/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/MainApp.java b/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/MainApp.java index 8a996a7..b35589f 100644 --- a/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/MainApp.java +++ b/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/MainApp.java @@ -11,20 +11,20 @@ public class MainApp { + private static final SearchUtil searchUtil = new SearchUtil();; + private static final FileUtil fileUtil = new FileUtil(); + public static void main(String[] args) throws IOException { if (args.length != 2) { - throw new IllegalArgumentException("Should be 2 parameters!"); + throw new IllegalArgumentException("Two 2 params should be provided: paramsFileName & excelFileName!"); } String paramsFileName = args[0]; String excelFileName = args[1]; - var searchUtil = new SearchUtil(); - var fileUtil = new FileUtil(); - var searchItems = new ArrayList(); for (var searchCriteria : fileUtil.loadSearchCriteria(paramsFileName)) { - String startingUrl = searchCriteria.getUrl(); - String label = searchCriteria.getTopic(); + String startingUrl = searchCriteria.url(); + String label = searchCriteria.topic(); var result = searchUtil.batchSearch(startingUrl, -1, 20) .stream() diff --git a/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/dto/BatchSearchResult.java b/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/dto/BatchSearchResult.java index 18e47b7..56c5e8e 100644 --- a/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/dto/BatchSearchResult.java +++ b/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/dto/BatchSearchResult.java @@ -1,13 +1,7 @@ package by.andd3dfx.pravtor.dto; import java.util.List; -import lombok.Getter; -import lombok.RequiredArgsConstructor; -@Getter -@RequiredArgsConstructor -public class BatchSearchResult { +public record BatchSearchResult(String topic, List dataItems) { - private final String topic; - private final List dataItems; } diff --git a/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/dto/SearchCriteria.java b/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/dto/SearchCriteria.java index eed6e7f..d893162 100644 --- a/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/dto/SearchCriteria.java +++ b/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/dto/SearchCriteria.java @@ -1,12 +1,5 @@ package by.andd3dfx.pravtor.dto; -import lombok.Getter; -import lombok.RequiredArgsConstructor; +public record SearchCriteria(String topic, String url) { -@Getter -@RequiredArgsConstructor -public class SearchCriteria { - - private final String topic; - private final String url; } diff --git a/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/util/FileUtil.java b/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/util/FileUtil.java index 3979044..562a2ef 100644 --- a/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/util/FileUtil.java +++ b/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/util/FileUtil.java @@ -15,12 +15,15 @@ import java.nio.file.Paths; import java.util.List; +/** + * Util to work with files + */ public class FileUtil { public static final String[] HEADER_LABELS = {"Название", "Seeds", "Peers", "Скачано", "Размер", "Ссылка"}; /** - * Load list of search criteria items + * Load list of search criteria items from file * * @param fileName name of params file * @return list of search criteria items @@ -43,7 +46,7 @@ public List loadSearchCriteria(String fileName) throws IOExcepti public void writeIntoExcel(String fileName, List searchItems) throws IOException { try (var book = new HSSFWorkbook();) { searchItems.forEach(searchItem -> { - Sheet sheet = book.createSheet(searchItem.getTopic()); + Sheet sheet = book.createSheet(searchItem.topic()); populateHeaderLabels(sheet); populateContent(sheet, searchItem); @@ -66,7 +69,7 @@ private void populateHeaderLabels(Sheet sheet) { private void populateContent(Sheet sheet, BatchSearchResult searchItem) { int rowsCount = 1; - for (TorrentData dataItem : searchItem.getDataItems()) { + for (TorrentData dataItem : searchItem.dataItems()) { int column_number = 0; Row row = sheet.createRow(rowsCount); row.createCell(column_number++).setCellValue(dataItem.getLabel()); diff --git a/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/util/SearchUtil.java b/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/util/SearchUtil.java index f1ddd7a..b1f84bf 100644 --- a/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/util/SearchUtil.java +++ b/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/util/SearchUtil.java @@ -16,21 +16,25 @@ @Slf4j public class SearchUtil extends WebCrawler { - private static final String PREFIX = "https://pravtor.ru/"; + private static final String BASE_URL = "https://pravtor.ru/"; @Override protected Elements extractElements(Document document) { return document.select("tr[id^=tr-]"); } - @Override - protected String extractPrevUrl(Document document) { - return extractPrevOrNext(document, "Пред."); - } - @Override protected String extractNextUrl(Document document) { - return extractPrevOrNext(document, "След."); + List pageItems = document + .select("td[class=tRight vBottom nowrap small]") + .select("a").stream() + .filter(s -> s.text().contains("След.")) + .toList(); + + if (pageItems.isEmpty()) { + return null; + } + return BASE_URL + pageItems.get(0).attr("href"); } @Override @@ -45,20 +49,11 @@ protected TorrentData mapElementToData(Element element) { .build(); } - private String extractPrevOrNext(Document document, String value) { - List pageItems = document.select("td[class=tRight vBottom nowrap small]") - .select("a").stream() - .filter(s -> s.text().contains(value)) - .toList(); - - if (pageItems.isEmpty()) { - return null; - } - return PREFIX + pageItems.get(0).attr("href"); - } - private String extractLink(String href) { - return StringUtils.isEmpty(href) ? href : PREFIX + href.substring(2); + if (StringUtils.isEmpty(href)) { + return href; + } + return BASE_URL + href.substring(2); } private Integer convertToInteger(String value) { diff --git a/pravtor.ru-crawler/src/test/java/by/andd3dfx/pravtor/MainAppTest.java b/pravtor.ru-crawler/src/test/java/by/andd3dfx/pravtor/MainAppTest.java index 5751491..b79cceb 100644 --- a/pravtor.ru-crawler/src/test/java/by/andd3dfx/pravtor/MainAppTest.java +++ b/pravtor.ru-crawler/src/test/java/by/andd3dfx/pravtor/MainAppTest.java @@ -74,7 +74,7 @@ private void runMainNCheckExceptionThrow(String[] args) throws Exception { mainApp.main(args); fail("Exception should be thrown"); } catch (IllegalArgumentException iae) { - assertThat(iae.getMessage(), is("Should be 2 parameters!")); + assertThat(iae.getMessage(), is("Two 2 params should be provided: paramsFileName & excelFileName!")); } } } \ No newline at end of file diff --git a/pravtor.ru-crawler/src/test/java/by/andd3dfx/pravtor/util/FileUtilTest.java b/pravtor.ru-crawler/src/test/java/by/andd3dfx/pravtor/util/FileUtilTest.java index bfef473..78dcc47 100644 --- a/pravtor.ru-crawler/src/test/java/by/andd3dfx/pravtor/util/FileUtilTest.java +++ b/pravtor.ru-crawler/src/test/java/by/andd3dfx/pravtor/util/FileUtilTest.java @@ -34,11 +34,11 @@ public void loadSearchCriteria() throws IOException { assertThat("Wrong count of criteria items", criteriaItems.size(), is(2)); var item0 = criteriaItems.get(0); - assertThat("Wrong url of first item", item0.getTopic(), is("txt-molitvy")); - assertThat("Wrong label of first item", item0.getUrl(), is("https://pravtor.ru/viewforum.php?f=184")); + assertThat("Wrong url of first item", item0.topic(), is("txt-molitvy")); + assertThat("Wrong label of first item", item0.url(), is("https://pravtor.ru/viewforum.php?f=184")); var item1 = criteriaItems.get(1); - assertThat("Wrong label of second item", item1.getTopic(), is("txt-kanony")); - assertThat("Wrong url of second item", item1.getUrl(), is("https://pravtor.ru/viewforum.php?f=183")); + assertThat("Wrong label of second item", item1.topic(), is("txt-kanony")); + assertThat("Wrong url of second item", item1.url(), is("https://pravtor.ru/viewforum.php?f=183")); } @Test diff --git a/pravtor.ru-crawler/src/test/java/by/andd3dfx/pravtor/util/SearchUtilTest.java b/pravtor.ru-crawler/src/test/java/by/andd3dfx/pravtor/util/SearchUtilTest.java index 591ba00..d61394d 100644 --- a/pravtor.ru-crawler/src/test/java/by/andd3dfx/pravtor/util/SearchUtilTest.java +++ b/pravtor.ru-crawler/src/test/java/by/andd3dfx/pravtor/util/SearchUtilTest.java @@ -31,6 +31,6 @@ public void batchSearch() { public void singleSearch() { var result = searchUtil.singleSearch(STARTING_URL); - assertThat("Wrong amount of result records", result.getDataItems().size(), is(50)); + assertThat("Wrong amount of result records", result.dataItems().size(), is(50)); } } diff --git a/rabota.by-crawler/src/main/java/by/andd3dfx/rabotaby/MainApp.java b/rabota.by-crawler/src/main/java/by/andd3dfx/rabotaby/MainApp.java index 790f77c..c33bb7f 100644 --- a/rabota.by-crawler/src/main/java/by/andd3dfx/rabotaby/MainApp.java +++ b/rabota.by-crawler/src/main/java/by/andd3dfx/rabotaby/MainApp.java @@ -16,7 +16,7 @@ public static void main(String[] args) throws IOException { } var searchUtil = new SearchUtil(); - var pageUrl = searchUtil.buildSearchUrl("java"); + var pageUrl = searchUtil.buildStartingSearchUrl("java"); var searchResult = searchUtil.batchSearch(pageUrl); var statisticsSortedMap = new StatisticsUtil().collectStatistics(searchResult); diff --git a/rabota.by-crawler/src/main/java/by/andd3dfx/rabotaby/util/SearchUtil.java b/rabota.by-crawler/src/main/java/by/andd3dfx/rabotaby/util/SearchUtil.java index 72972b8..72cd9c0 100644 --- a/rabota.by-crawler/src/main/java/by/andd3dfx/rabotaby/util/SearchUtil.java +++ b/rabota.by-crawler/src/main/java/by/andd3dfx/rabotaby/util/SearchUtil.java @@ -2,24 +2,24 @@ import by.andd3dfx.crawler.engine.WebCrawler; import by.andd3dfx.rabotaby.dto.VacancyData; +import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; -import java.io.IOException; import java.util.Arrays; import java.util.stream.Collectors; @Slf4j public class SearchUtil extends WebCrawler { - private final String URL_PREFIX = "http://rabota.by"; - private final String searchUrlFormat = URL_PREFIX + "/search/vacancy?area=1002&text=%s&page=%d"; + private final String BASE_URL = "http://rabota.by"; - public String buildSearchUrl(String searchString) { - return String.format(searchUrlFormat, searchString, 0); + public String buildStartingSearchUrl(String searchString) { + final var format = BASE_URL + "/search/vacancy?area=1002&text=%s&page=%d"; + return String.format(format, searchString, 0); } @Override @@ -33,21 +33,17 @@ protected String extractNextUrl(Document document) { if (nextPageItem.isEmpty()) { return null; } - return URL_PREFIX + nextPageItem.attr("href"); + return BASE_URL + nextPageItem.attr("href"); } + @SneakyThrows @Override protected VacancyData mapElementToData(Element element) { String searchUrl = element.select("a").attr("href"); log.info("Retrieve vacancy details for {}", searchUrl); - Document document; - try { - document = Jsoup + Document document = Jsoup .connect(searchUrl) .userAgent(USER_AGENT).get(); - } catch (IOException e) { - throw new RuntimeException("Retrieve details failed", e); - } return VacancyData.builder() .url(document.baseUri()) diff --git a/rabota.by-crawler/src/test/java/by/andd3dfx/rabotaby/util/SearchUtilTest.java b/rabota.by-crawler/src/test/java/by/andd3dfx/rabotaby/util/SearchUtilTest.java index a576ea4..f58bdae 100644 --- a/rabota.by-crawler/src/test/java/by/andd3dfx/rabotaby/util/SearchUtilTest.java +++ b/rabota.by-crawler/src/test/java/by/andd3dfx/rabotaby/util/SearchUtilTest.java @@ -21,17 +21,17 @@ public void setup() { @Test public void singleSearch() { - var pageUrl = searchUtil.buildSearchUrl("java"); + var pageUrl = searchUtil.buildStartingSearchUrl("java"); var result = searchUtil.singleSearch(pageUrl); - assertThat("Next url should be present", result.getNextPageUrl(), is( + assertThat("Next url should be present", result.nextPageUrl(), is( "http://rabota.by/search/vacancy?area=1002&text=java&page=1&hhtmFrom=vacancy_search_list")); - assertThat("At least 20 items expected", result.getDataItems().size(), greaterThanOrEqualTo(RECORDS_PER_PAGE)); + assertThat("At least 20 items expected", result.dataItems().size(), greaterThanOrEqualTo(RECORDS_PER_PAGE)); } @Test public void batchSearch() { - var pageUrl = searchUtil.buildSearchUrl("java"); + var pageUrl = searchUtil.buildStartingSearchUrl("java"); var searchResult = searchUtil.batchSearch(pageUrl, 2); assertThat(searchResult.size(), is(2 * RECORDS_PER_PAGE));