-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #5 from andrei-punko/extract-common-crawler-functi…
…onality-into-separate-module Extract common crawler functionality into separate module
- Loading branch information
Showing
21 changed files
with
405 additions
and
211 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<project xmlns="http://maven.apache.org/POM/4.0.0" | ||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | ||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | ||
<modelVersion>4.0.0</modelVersion> | ||
<parent> | ||
<groupId>by.andd3dfx</groupId> | ||
<artifactId>java-crawlers</artifactId> | ||
<version>1.0-SNAPSHOT</version> | ||
</parent> | ||
|
||
<artifactId>crawler-engine</artifactId> | ||
|
||
<properties> | ||
<maven.compiler.source>21</maven.compiler.source> | ||
<maven.compiler.target>21</maven.compiler.target> | ||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> | ||
<lombok.version>1.18.34</lombok.version> | ||
<slf4j.version>2.0.10</slf4j.version> | ||
<log4j-core.version>2.21.1</log4j-core.version> | ||
</properties> | ||
|
||
<dependencies> | ||
<dependency> | ||
<groupId>org.projectlombok</groupId> | ||
<artifactId>lombok</artifactId> | ||
<version>${lombok.version}</version> | ||
</dependency> | ||
<dependency> | ||
<groupId>org.jsoup</groupId> | ||
<artifactId>jsoup</artifactId> | ||
<version>1.17.2</version> | ||
</dependency> | ||
|
||
<!-- Logging --> | ||
<dependency> | ||
<groupId>org.slf4j</groupId> | ||
<artifactId>slf4j-api</artifactId> | ||
<version>${slf4j.version}</version> | ||
</dependency> | ||
<dependency> | ||
<groupId>org.slf4j</groupId> | ||
<artifactId>slf4j-reload4j</artifactId> | ||
<version>${slf4j.version}</version> | ||
</dependency> | ||
</dependencies> | ||
|
||
<build> | ||
<plugins> | ||
<plugin> | ||
<artifactId>maven-compiler-plugin</artifactId> | ||
<version>3.13.0</version> | ||
<configuration> | ||
<annotationProcessorPaths> | ||
<path> | ||
<groupId>org.projectlombok</groupId> | ||
<artifactId>lombok</artifactId> | ||
<version>${lombok.version}</version> | ||
</path> | ||
</annotationProcessorPaths> | ||
</configuration> | ||
</plugin> | ||
</plugins> | ||
</build> | ||
</project> |
7 changes: 7 additions & 0 deletions
7
crawler-engine/src/main/java/by/andd3dfx/crawler/dto/CrawlerData.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
package by.andd3dfx.crawler.dto; | ||
|
||
/** | ||
* Marker interface for data items retrieved by a crawler. | ||
* Declares no methods; it serves only as a common upper bound for item types. | ||
*/ | ||
public interface CrawlerData { | ||
} |
9 changes: 5 additions & 4 deletions
9
...dfx/pravtor/model/SingleSearchResult.java → ...d3dfx/crawler/dto/SingleSearchResult.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,15 @@ | ||
package by.andd3dfx.pravtor.model; | ||
package by.andd3dfx.crawler.dto; | ||
|
||
import java.util.List; | ||
import lombok.Getter; | ||
import lombok.RequiredArgsConstructor; | ||
|
||
import java.util.List; | ||
|
||
@Getter | ||
@RequiredArgsConstructor | ||
public class SingleSearchResult { | ||
public class SingleSearchResult<T extends CrawlerData> { | ||
|
||
private final List<TorrentData> dataItems; | ||
private final List<T> dataItems; | ||
private final String prevPageUrl; | ||
private final String nextPageUrl; | ||
} |
114 changes: 114 additions & 0 deletions
114
crawler-engine/src/main/java/by/andd3dfx/crawler/engine/WebCrawler.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
package by.andd3dfx.crawler.engine; | ||
|
||
import by.andd3dfx.crawler.dto.CrawlerData; | ||
import by.andd3dfx.crawler.dto.SingleSearchResult; | ||
import lombok.SneakyThrows; | ||
import lombok.extern.slf4j.Slf4j; | ||
import org.jsoup.Jsoup; | ||
import org.jsoup.nodes.Document; | ||
import org.jsoup.nodes.Element; | ||
import org.jsoup.select.Elements; | ||
|
||
import java.util.ArrayList; | ||
import java.util.List; | ||
|
||
/** | ||
* Web crawler for retrieving list of data with type T by consequent visiting of pages. | ||
* Process started using provided starting page URL; next link on each page retrieved from current page. | ||
* | ||
* @param <T> data type | ||
*/ | ||
@Slf4j | ||
public abstract class WebCrawler<T extends CrawlerData> { | ||
|
||
protected static final String USER_AGENT = "Mozilla"; | ||
|
||
/** | ||
* Batch search using provided starting page URL, max pages cap 10 and throttling delay 20ms | ||
* | ||
* @param pageUrl starting page URL | ||
* @return list of retrieved items | ||
*/ | ||
public List<T> batchSearch(String pageUrl) { | ||
return batchSearch(pageUrl, 10, 20); | ||
} | ||
|
||
/** | ||
* Batch search using provided starting page URL and max pages cap. Used throttling delay is 20ms. | ||
* Use value -1 for max pages cap to visit all available pages. | ||
* | ||
* @param pageUrl starting page URL | ||
* @param maxPagesCap max pages amount (search will be stopped when this amount of pages requested or no more pages available) | ||
* @return list of retrieved items | ||
*/ | ||
public List<T> batchSearch(String pageUrl, int maxPagesCap) { | ||
return batchSearch(pageUrl, maxPagesCap, 20); | ||
} | ||
|
||
/** | ||
* Batch search using provided starting page URL, max pages cap and throttling delay. | ||
* Use value -1 for max pages cap to visit all available pages. | ||
* | ||
* @param pageUrl starting page URL | ||
* @param maxPagesCap max pages amount (search will be stopped when this amount of pages requested or no more pages available) | ||
* @param throttlingDelayMs delay between two consequent page requests, milliseconds | ||
* @return list of retrieved items | ||
*/ | ||
@SneakyThrows | ||
public List<T> batchSearch(String pageUrl, int maxPagesCap, long throttlingDelayMs) { | ||
assert (throttlingDelayMs > 0); | ||
log.info("Batch search. Starting URL={}, maxPagesCap={}, delay={}ms", pageUrl, maxPagesCap, throttlingDelayMs); | ||
|
||
int pagesCounter = 0; | ||
var nextPage = pageUrl; | ||
List<T> result = new ArrayList<>(); | ||
|
||
while (nextPage != null && (maxPagesCap == -1 || pagesCounter < maxPagesCap)) { | ||
SingleSearchResult<T> searchResult = singleSearch(nextPage); | ||
List<T> dataItems = searchResult.getDataItems(); | ||
log.info("Hit №{}, {} items retrieved", pagesCounter, dataItems.size()); | ||
pagesCounter++; | ||
result.addAll(dataItems); | ||
nextPage = searchResult.getNextPageUrl(); | ||
|
||
Thread.sleep(throttlingDelayMs); | ||
} | ||
log.info("Total records retrieved: {}", result.size()); | ||
|
||
return result; | ||
} | ||
|
||
/** | ||
* Search and extract data from page with provided URL | ||
* | ||
* @param pageUrl URL of page | ||
* @return search result | ||
*/ | ||
@SneakyThrows | ||
public SingleSearchResult<T> singleSearch(String pageUrl) { | ||
Document document = Jsoup | ||
.connect(pageUrl) | ||
.userAgent(USER_AGENT).get(); | ||
|
||
Elements elements = extractElements(document); | ||
|
||
List<T> dataItems = elements.parallelStream() | ||
.map(this::mapElementToData) | ||
.toList(); | ||
log.debug("Single search: url={}, items={}", pageUrl, dataItems.size()); | ||
|
||
String prevUrl = extractPrevUrl(document); | ||
String nextUrl = extractNextUrl(document); | ||
return new SingleSearchResult(dataItems, prevUrl, nextUrl); | ||
} | ||
|
||
protected abstract Elements extractElements(Document document); | ||
|
||
protected String extractPrevUrl(Document document) { | ||
return null; | ||
} | ||
|
||
protected abstract String extractNextUrl(Document document); | ||
|
||
protected abstract T mapElementToData(Element element); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# Root Logger | ||
log4j.rootLogger=INFO, stdout | ||
|
||
# Direct log messages to stdout | ||
log4j.appender.stdout=org.apache.log4j.ConsoleAppender | ||
log4j.appender.stdout.Target=System.out | ||
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout | ||
log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<!-- Extra logging related to initialization of Log4j. | ||
Set to debug or trace if log4j initialization is failing. --> | ||
<Configuration status="WARN"> | ||
<Appenders> | ||
<Console name="Console" target="SYSTEM_OUT"> | ||
<PatternLayout pattern="%d{HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/> | ||
</Console> | ||
</Appenders> | ||
<Loggers> | ||
<!-- | ||
<Logger name="<your_package_name>.<your_class_name>" level="debug"> | ||
<AppenderRef ref="Console"/> | ||
</Logger> | ||
--> | ||
<Root level="INFO"> | ||
<AppenderRef ref="Console"/> | ||
</Root> | ||
</Loggers> | ||
</Configuration> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
3 changes: 2 additions & 1 deletion
3
pravtor.ru-crawler/src/main/java/by/andd3dfx/pravtor/model/TorrentData.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.