From e645375a3413542b97f5924790e6002cb9b87e69 Mon Sep 17 00:00:00 2001 From: Andrei Punko Date: Tue, 5 Nov 2024 00:12:04 +0300 Subject: [PATCH] Put crawlers into `crawler` package & rename them. Adjust paths in README --- README.md | 25 +++++++++++-------- .../java/by/andd3dfx/pravtor/MainApp.java | 6 ++--- .../PravtorWebCrawler.java} | 4 +-- .../PravtorWebCrawlerTest.java} | 12 ++++----- .../java/by/andd3dfx/rabotaby/MainApp.java | 8 +++--- .../RabotabyWebCrawler.java} | 4 +-- .../RabotabyWebCrawlerTest.java} | 18 ++++++------- 7 files changed, 40 insertions(+), 37 deletions(-) rename pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/{util/SearchUtil.java => crawler/PravtorWebCrawler.java} (92%) rename pravtor.ru-crawler/src/test/java/by/andd3dfx/pravtor/{util/SearchUtilTest.java => crawler/PravtorWebCrawlerTest.java} (67%) rename rabota.by-crawler/src/main/java/by/andd3dfx/rabotaby/{util/SearchUtil.java => crawler/RabotabyWebCrawler.java} (92%) rename rabota.by-crawler/src/test/java/by/andd3dfx/rabotaby/{util/SearchUtilTest.java => crawler/RabotabyWebCrawlerTest.java} (60%) diff --git a/README.md b/README.md index 1c64788..9c24cd8 100644 --- a/README.md +++ b/README.md @@ -1,32 +1,37 @@ - # Collection of Java-based web crawlers + [![Java CI with Maven](https://github.com/andrei-punko/java-crawlers/actions/workflows/maven.yml/badge.svg)](https://github.com/andrei-punko/java-crawlers/actions/workflows/maven.yml) ## Prerequisites + - Maven 3 - JDK 21 ## How to build + ``` mvn clean install ``` ## Common crawler functionality -- Your crawler should extend [WebCrawler](crawler-engine/src/main/java/by/andd3dfx/crawler/engine/WebCrawler.java) -base crawler class -- DTO class which describes collected data should implement -[CrawlerData](crawler-engine/src/main/java/by/andd3dfx/crawler/dto/CrawlerData.java) marker interface + +- Your crawler should extend [WebCrawler](crawler-engine/src/main/java/by/andd3dfx/crawler/engine/WebCrawler.java) + base crawler class +- DTO 
class which describes collected data should implement + [CrawlerData](crawler-engine/src/main/java/by/andd3dfx/crawler/dto/CrawlerData.java) marker interface ## Crawler for Orthodox torrent tracker [pravtor.ru](http://pravtor.ru) -Check [SearchUtil](pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/util/SearchUtil.java) + +Check [PravtorWebCrawler](pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/crawler/PravtorWebCrawler.java) in `pravtor.ru-crawler` module for details -To make search - run [run-search.bat](pravtor.ru-crawler/run-search.bat) script. +To run a search, use the [run-search.bat](pravtor.ru-crawler/run-search.bat) script. Collected data will be placed into [result.xls](pravtor.ru-crawler/sandbox/result.xls) file in `sandbox` folder -## Crawler for vacancies aggregator [rabota.by / hh.ru](http://rabota.by) -Check [SearchUtil](rabota.by-crawler/src/main/java/by/andd3dfx/sitesparsing/rabotaby/SearchUtil.java) +## Crawler for vacancies aggregator [rabota.by](http://rabota.by) (it's a localized version of [hh.ru](http://hh.ru) in Belarus) + +Check [RabotabyWebCrawler](rabota.by-crawler/src/main/java/by/andd3dfx/rabotaby/crawler/RabotabyWebCrawler.java) in `rabota.by-crawler` module for details -To make search - run `main()` method of [MainApp](rabota.by-crawler/src/main/java/by/andd3dfx/sitesparsing/rabotaby/MainApp.java) +To run a search, run the `main()` method of [MainApp](rabota.by-crawler/src/main/java/by/andd3dfx/rabotaby/MainApp.java) class with populated output path in command line param diff --git a/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/MainApp.java b/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/MainApp.java index b35589f..f5e7554 100644 --- a/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/MainApp.java +++ b/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/MainApp.java @@ -3,7 +3,7 @@ import by.andd3dfx.pravtor.dto.BatchSearchResult; import by.andd3dfx.pravtor.dto.TorrentData; import by.andd3dfx.pravtor.util.FileUtil; 
-import by.andd3dfx.pravtor.util.SearchUtil; +import by.andd3dfx.pravtor.crawler.PravtorWebCrawler; import java.io.IOException; import java.util.ArrayList; @@ -11,7 +11,7 @@ public class MainApp { - private static final SearchUtil searchUtil = new SearchUtil();; + private static final PravtorWebCrawler crawler = new PravtorWebCrawler();; private static final FileUtil fileUtil = new FileUtil(); public static void main(String[] args) throws IOException { @@ -26,7 +26,7 @@ public static void main(String[] args) throws IOException { String startingUrl = searchCriteria.url(); String label = searchCriteria.topic(); - var result = searchUtil.batchSearch(startingUrl, -1, 20) + var result = crawler.batchSearch(startingUrl, -1, 20) .stream() .filter(torrentData -> torrentData.getDownloadedCount() != null) .sorted(Comparator.comparingInt(TorrentData::getDownloadedCount).reversed()) diff --git a/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/util/SearchUtil.java b/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/crawler/PravtorWebCrawler.java similarity index 92% rename from pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/util/SearchUtil.java rename to pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/crawler/PravtorWebCrawler.java index b1f84bf..92905e4 100644 --- a/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/util/SearchUtil.java +++ b/pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/crawler/PravtorWebCrawler.java @@ -1,4 +1,4 @@ -package by.andd3dfx.pravtor.util; +package by.andd3dfx.pravtor.crawler; import by.andd3dfx.crawler.engine.WebCrawler; import by.andd3dfx.pravtor.dto.TorrentData; @@ -14,7 +14,7 @@ * Util to perform search on pravtor.ru torrent tracker */ @Slf4j -public class SearchUtil extends WebCrawler { +public class PravtorWebCrawler extends WebCrawler { private static final String BASE_URL = "https://pravtor.ru/"; diff --git a/pravtor.ru-crawler/src/test/java/by/andd3dfx/pravtor/util/SearchUtilTest.java 
b/pravtor.ru-crawler/src/test/java/by/andd3dfx/pravtor/crawler/PravtorWebCrawlerTest.java similarity index 67% rename from pravtor.ru-crawler/src/test/java/by/andd3dfx/pravtor/util/SearchUtilTest.java rename to pravtor.ru-crawler/src/test/java/by/andd3dfx/pravtor/crawler/PravtorWebCrawlerTest.java index d61394d..7d3007c 100644 --- a/pravtor.ru-crawler/src/test/java/by/andd3dfx/pravtor/util/SearchUtilTest.java +++ b/pravtor.ru-crawler/src/test/java/by/andd3dfx/pravtor/crawler/PravtorWebCrawlerTest.java @@ -1,4 +1,4 @@ -package by.andd3dfx.pravtor.util; +package by.andd3dfx.pravtor.crawler; import by.andd3dfx.pravtor.dto.TorrentData; import org.junit.Before; @@ -9,27 +9,27 @@ import static org.hamcrest.CoreMatchers.is; import static org.hamcrest.MatcherAssert.assertThat; -public class SearchUtilTest { +public class PravtorWebCrawlerTest { private final String STARTING_URL = "https://pravtor.ru/viewforum.php?f=28"; // Святоотеческие тексты и жития святых - private SearchUtil searchUtil; + private PravtorWebCrawler crawler; @Before public void setup() { - searchUtil = new SearchUtil(); + crawler = new PravtorWebCrawler(); } @Test public void batchSearch() { - List result = searchUtil.batchSearch(STARTING_URL, 2, 20); + List result = crawler.batchSearch(STARTING_URL, 2, 20); assertThat("Wrong amount of result records", result.size(), is(100)); } @Test public void singleSearch() { - var result = searchUtil.singleSearch(STARTING_URL); + var result = crawler.singleSearch(STARTING_URL); assertThat("Wrong amount of result records", result.dataItems().size(), is(50)); } diff --git a/rabota.by-crawler/src/main/java/by/andd3dfx/rabotaby/MainApp.java b/rabota.by-crawler/src/main/java/by/andd3dfx/rabotaby/MainApp.java index c33bb7f..e5f9345 100644 --- a/rabota.by-crawler/src/main/java/by/andd3dfx/rabotaby/MainApp.java +++ b/rabota.by-crawler/src/main/java/by/andd3dfx/rabotaby/MainApp.java @@ -1,6 +1,6 @@ package by.andd3dfx.rabotaby; -import by.andd3dfx.rabotaby.util.SearchUtil; 
+import by.andd3dfx.rabotaby.crawler.RabotabyWebCrawler; import by.andd3dfx.rabotaby.util.StatisticsUtil; import java.io.IOException; @@ -15,9 +15,9 @@ public static void main(String[] args) throws IOException { throw new IllegalArgumentException("Path to output file should be populated!"); } - var searchUtil = new SearchUtil(); - var pageUrl = searchUtil.buildStartingSearchUrl("java"); - var searchResult = searchUtil.batchSearch(pageUrl); + var crawler = new RabotabyWebCrawler(); + var pageUrl = crawler.buildStartingSearchUrl("java"); + var searchResult = crawler.batchSearch(pageUrl); var statisticsSortedMap = new StatisticsUtil().collectStatistics(searchResult); diff --git a/rabota.by-crawler/src/main/java/by/andd3dfx/rabotaby/util/SearchUtil.java b/rabota.by-crawler/src/main/java/by/andd3dfx/rabotaby/crawler/RabotabyWebCrawler.java similarity index 92% rename from rabota.by-crawler/src/main/java/by/andd3dfx/rabotaby/util/SearchUtil.java rename to rabota.by-crawler/src/main/java/by/andd3dfx/rabotaby/crawler/RabotabyWebCrawler.java index 72cd9c0..065ff0a 100644 --- a/rabota.by-crawler/src/main/java/by/andd3dfx/rabotaby/util/SearchUtil.java +++ b/rabota.by-crawler/src/main/java/by/andd3dfx/rabotaby/crawler/RabotabyWebCrawler.java @@ -1,4 +1,4 @@ -package by.andd3dfx.rabotaby.util; +package by.andd3dfx.rabotaby.crawler; import by.andd3dfx.crawler.engine.WebCrawler; import by.andd3dfx.rabotaby.dto.VacancyData; @@ -13,7 +13,7 @@ import java.util.stream.Collectors; @Slf4j -public class SearchUtil extends WebCrawler { +public class RabotabyWebCrawler extends WebCrawler { private final String BASE_URL = "http://rabota.by"; diff --git a/rabota.by-crawler/src/test/java/by/andd3dfx/rabotaby/util/SearchUtilTest.java b/rabota.by-crawler/src/test/java/by/andd3dfx/rabotaby/crawler/RabotabyWebCrawlerTest.java similarity index 60% rename from rabota.by-crawler/src/test/java/by/andd3dfx/rabotaby/util/SearchUtilTest.java rename to 
rabota.by-crawler/src/test/java/by/andd3dfx/rabotaby/crawler/RabotabyWebCrawlerTest.java index f58bdae..29208a3 100644 --- a/rabota.by-crawler/src/test/java/by/andd3dfx/rabotaby/util/SearchUtilTest.java +++ b/rabota.by-crawler/src/test/java/by/andd3dfx/rabotaby/crawler/RabotabyWebCrawlerTest.java @@ -1,4 +1,4 @@ -package by.andd3dfx.rabotaby.util; +package by.andd3dfx.rabotaby.crawler; import org.junit.Before; import org.junit.Test; @@ -7,22 +7,20 @@ import static org.hamcrest.MatcherAssert.assertThat; import static org.hamcrest.Matchers.greaterThanOrEqualTo; -public class SearchUtilTest { +public class RabotabyWebCrawlerTest { private static final int RECORDS_PER_PAGE = 20; - private SearchUtil searchUtil; - private StatisticsUtil statisticsUtil; + private RabotabyWebCrawler crawler; @Before public void setup() { - searchUtil = new SearchUtil(); - statisticsUtil = new StatisticsUtil(); + crawler = new RabotabyWebCrawler(); } @Test public void singleSearch() { - var pageUrl = searchUtil.buildStartingSearchUrl("java"); - var result = searchUtil.singleSearch(pageUrl); + var pageUrl = crawler.buildStartingSearchUrl("java"); + var result = crawler.singleSearch(pageUrl); assertThat("Next url should be present", result.nextPageUrl(), is( "http://rabota.by/search/vacancy?area=1002&text=java&page=1&hhtmFrom=vacancy_search_list")); @@ -31,8 +29,8 @@ public void singleSearch() { @Test public void batchSearch() { - var pageUrl = searchUtil.buildStartingSearchUrl("java"); - var searchResult = searchUtil.batchSearch(pageUrl, 2); + var pageUrl = crawler.buildStartingSearchUrl("java"); + var searchResult = crawler.batchSearch(pageUrl, 2); assertThat(searchResult.size(), is(2 * RECORDS_PER_PAGE)); }