Skip to content

Commit

Permalink
Put crawlers into crawler package & rename them. Adjust paths in RE…
Browse files Browse the repository at this point in the history
…ADME
  • Loading branch information
andrei-punko committed Nov 4, 2024
1 parent b3e22ef commit e645375
Show file tree
Hide file tree
Showing 7 changed files with 40 additions and 37 deletions.
25 changes: 15 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,32 +1,37 @@

# Collection of Java-based web crawlers

[![Java CI with Maven](https://github.com/andrei-punko/java-crawlers/actions/workflows/maven.yml/badge.svg)](https://github.com/andrei-punko/java-crawlers/actions/workflows/maven.yml)

## Prerequisites

- Maven 3
- JDK 21

## How to build

```
mvn clean install
```

## Common crawler functionality
- Your crawler should extend [WebCrawler](crawler-engine/src/main/java/by/andd3dfx/crawler/engine/WebCrawler.java)
base crawler class
- DTO class which describes collected data should implement
[CrawlerData](crawler-engine/src/main/java/by/andd3dfx/crawler/dto/CrawlerData.java) marker interface

- Your crawler should extend [WebCrawler](crawler-engine/src/main/java/by/andd3dfx/crawler/engine/WebCrawler.java)
base crawler class
- DTO class which describes collected data should implement
[CrawlerData](crawler-engine/src/main/java/by/andd3dfx/crawler/dto/CrawlerData.java) marker interface

## Crawler for Orthodox torrent tracker [pravtor.ru](http://pravtor.ru)
Check [SearchUtil](pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/util/SearchUtil.java)

Check [PravtorWebCrawler](pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/crawler/PravtorWebCrawler.java)
in `pravtor.ru-crawler` module for details

To make search - run [run-search.bat](pravtor.ru-crawler/run-search.bat) script.
To run a search, use the [run-search.bat](pravtor.ru-crawler/run-search.bat) script.
Collected data will be placed into [result.xls](pravtor.ru-crawler/sandbox/result.xls) file in `sandbox` folder

## Crawler for vacancies aggregator [rabota.by / hh.ru](http://rabota.by)
Check [SearchUtil](rabota.by-crawler/src/main/java/by/andd3dfx/sitesparsing/rabotaby/SearchUtil.java)
## Crawler for the vacancies aggregator [rabota.by](http://rabota.by) (the localized version of [hh.ru](http://hh.ru) for Belarus)

Check [RabotabyWebCrawler](rabota.by-crawler/src/main/java/by/andd3dfx/rabotaby/crawler/RabotabyWebCrawler.java)
in `rabota.by-crawler` module for details

To make search - run `main()` method of [MainApp](rabota.by-crawler/src/main/java/by/andd3dfx/sitesparsing/rabotaby/MainApp.java)
To run a search, execute the `main()` method of the [MainApp](rabota.by-crawler/src/main/java/by/andd3dfx/rabotaby/MainApp.java)
class, passing the output file path as a command-line parameter
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@
import by.andd3dfx.pravtor.dto.BatchSearchResult;
import by.andd3dfx.pravtor.dto.TorrentData;
import by.andd3dfx.pravtor.util.FileUtil;
import by.andd3dfx.pravtor.util.SearchUtil;
import by.andd3dfx.pravtor.crawler.PravtorWebCrawler;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;

public class MainApp {

private static final SearchUtil searchUtil = new SearchUtil();;
private static final PravtorWebCrawler crawler = new PravtorWebCrawler();;
private static final FileUtil fileUtil = new FileUtil();

public static void main(String[] args) throws IOException {
Expand All @@ -26,7 +26,7 @@ public static void main(String[] args) throws IOException {
String startingUrl = searchCriteria.url();
String label = searchCriteria.topic();

var result = searchUtil.batchSearch(startingUrl, -1, 20)
var result = crawler.batchSearch(startingUrl, -1, 20)
.stream()
.filter(torrentData -> torrentData.getDownloadedCount() != null)
.sorted(Comparator.comparingInt(TorrentData::getDownloadedCount).reversed())
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package by.andd3dfx.pravtor.util;
package by.andd3dfx.pravtor.crawler;

import by.andd3dfx.crawler.engine.WebCrawler;
import by.andd3dfx.pravtor.dto.TorrentData;
Expand All @@ -14,7 +14,7 @@
* Util to perform search on <a href="http://pravtor.ru">pravtor.ru</a> torrent tracker
*/
@Slf4j
public class SearchUtil extends WebCrawler<TorrentData> {
public class PravtorWebCrawler extends WebCrawler<TorrentData> {

private static final String BASE_URL = "https://pravtor.ru/";

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package by.andd3dfx.pravtor.util;
package by.andd3dfx.pravtor.crawler;

import by.andd3dfx.pravtor.dto.TorrentData;
import org.junit.Before;
Expand All @@ -9,27 +9,27 @@
import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.MatcherAssert.assertThat;

public class SearchUtilTest {
public class PravtorWebCrawlerTest {

private final String STARTING_URL = "https://pravtor.ru/viewforum.php?f=28"; // Святоотеческие тексты и жития святых

private SearchUtil searchUtil;
private PravtorWebCrawler crawler;

@Before
public void setup() {
searchUtil = new SearchUtil();
crawler = new PravtorWebCrawler();
}

@Test
public void batchSearch() {
List<TorrentData> result = searchUtil.batchSearch(STARTING_URL, 2, 20);
List<TorrentData> result = crawler.batchSearch(STARTING_URL, 2, 20);

assertThat("Wrong amount of result records", result.size(), is(100));
}

@Test
public void singleSearch() {
var result = searchUtil.singleSearch(STARTING_URL);
var result = crawler.singleSearch(STARTING_URL);

assertThat("Wrong amount of result records", result.dataItems().size(), is(50));
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package by.andd3dfx.rabotaby;

import by.andd3dfx.rabotaby.util.SearchUtil;
import by.andd3dfx.rabotaby.crawler.RabotabyWebCrawler;
import by.andd3dfx.rabotaby.util.StatisticsUtil;

import java.io.IOException;
Expand All @@ -15,9 +15,9 @@ public static void main(String[] args) throws IOException {
throw new IllegalArgumentException("Path to output file should be populated!");
}

var searchUtil = new SearchUtil();
var pageUrl = searchUtil.buildStartingSearchUrl("java");
var searchResult = searchUtil.batchSearch(pageUrl);
var crawler = new RabotabyWebCrawler();
var pageUrl = crawler.buildStartingSearchUrl("java");
var searchResult = crawler.batchSearch(pageUrl);

var statisticsSortedMap = new StatisticsUtil().collectStatistics(searchResult);

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package by.andd3dfx.rabotaby.util;
package by.andd3dfx.rabotaby.crawler;

import by.andd3dfx.crawler.engine.WebCrawler;
import by.andd3dfx.rabotaby.dto.VacancyData;
Expand All @@ -13,7 +13,7 @@
import java.util.stream.Collectors;

@Slf4j
public class SearchUtil extends WebCrawler<VacancyData> {
public class RabotabyWebCrawler extends WebCrawler<VacancyData> {

private final String BASE_URL = "http://rabota.by";

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package by.andd3dfx.rabotaby.util;
package by.andd3dfx.rabotaby.crawler;

import org.junit.Before;
import org.junit.Test;
Expand All @@ -7,22 +7,20 @@
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.greaterThanOrEqualTo;

public class SearchUtilTest {
public class RabotabyWebCrawlerTest {

private static final int RECORDS_PER_PAGE = 20;
private SearchUtil searchUtil;
private StatisticsUtil statisticsUtil;
private RabotabyWebCrawler crawler;

@Before
public void setup() {
searchUtil = new SearchUtil();
statisticsUtil = new StatisticsUtil();
crawler = new RabotabyWebCrawler();
}

@Test
public void singleSearch() {
var pageUrl = searchUtil.buildStartingSearchUrl("java");
var result = searchUtil.singleSearch(pageUrl);
var pageUrl = crawler.buildStartingSearchUrl("java");
var result = crawler.singleSearch(pageUrl);

assertThat("Next url should be present", result.nextPageUrl(), is(
"http://rabota.by/search/vacancy?area=1002&text=java&page=1&hhtmFrom=vacancy_search_list"));
Expand All @@ -31,8 +29,8 @@ public void singleSearch() {

@Test
public void batchSearch() {
var pageUrl = searchUtil.buildStartingSearchUrl("java");
var searchResult = searchUtil.batchSearch(pageUrl, 2);
var pageUrl = crawler.buildStartingSearchUrl("java");
var searchResult = crawler.batchSearch(pageUrl, 2);

assertThat(searchResult.size(), is(2 * RECORDS_PER_PAGE));
}
Expand Down

0 comments on commit e645375

Please sign in to comment.