
Commit

Merge pull request #5 from andrei-punko/extract-common-crawler-functionality-into-separate-module

Extract common crawler functionality into separate module
andrei-punko authored Nov 2, 2024
2 parents f86806b + d554108 commit f587799
Showing 21 changed files with 405 additions and 211 deletions.
65 changes: 65 additions & 0 deletions crawler-engine/pom.xml
@@ -0,0 +1,65 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>by.andd3dfx</groupId>
<artifactId>java-crawlers</artifactId>
<version>1.0-SNAPSHOT</version>
</parent>

<artifactId>crawler-engine</artifactId>

<properties>
<maven.compiler.source>21</maven.compiler.source>
<maven.compiler.target>21</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<lombok.version>1.18.34</lombok.version>
<slf4j.version>2.0.10</slf4j.version>
<log4j-core.version>2.21.1</log4j-core.version>
</properties>

<dependencies>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>${lombok.version}</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.17.2</version>
</dependency>

<!-- Logging -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>${slf4j.version}</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-reload4j</artifactId>
<version>${slf4j.version}</version>
</dependency>
</dependencies>

<build>
<plugins>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.13.0</version>
<configuration>
<annotationProcessorPaths>
<path>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>${lombok.version}</version>
</path>
</annotationProcessorPaths>
</configuration>
</plugin>
</plugins>
</build>
</project>
@@ -0,0 +1,7 @@
package by.andd3dfx.crawler.dto;

/**
* Marker interface for retrieved data
*/
public interface CrawlerData {
}
@@ -1,14 +1,15 @@
package by.andd3dfx.pravtor.model;
package by.andd3dfx.crawler.dto;

import java.util.List;
import lombok.Getter;
import lombok.RequiredArgsConstructor;

import java.util.List;

@Getter
@RequiredArgsConstructor
public class SingleSearchResult {
public class SingleSearchResult<T extends CrawlerData> {

private final List<TorrentData> dataItems;
private final List<T> dataItems;
private final String prevPageUrl;
private final String nextPageUrl;
}
@@ -0,0 +1,114 @@
package by.andd3dfx.crawler.engine;

import by.andd3dfx.crawler.dto.CrawlerData;
import by.andd3dfx.crawler.dto.SingleSearchResult;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.ArrayList;
import java.util.List;

/**
* Web crawler that retrieves a list of data items of type T by visiting pages consecutively.
* The process starts from the provided starting page URL; the link to the next page is extracted from the current page.
*
* @param <T> data type
*/
@Slf4j
public abstract class WebCrawler<T extends CrawlerData> {

protected static final String USER_AGENT = "Mozilla";

/**
* Batch search using the provided starting page URL, with a max pages cap of 10 and a throttling delay of 20 ms
*
* @param pageUrl starting page URL
* @return list of retrieved items
*/
public List<T> batchSearch(String pageUrl) {
return batchSearch(pageUrl, 10, 20);
}

/**
* Batch search using the provided starting page URL and max pages cap, with a throttling delay of 20 ms.
* Use the value -1 as the max pages cap to visit all available pages.
*
* @param pageUrl starting page URL
* @param maxPagesCap maximum number of pages (the search stops once this many pages have been requested or no more pages are available)
* @return list of retrieved items
*/
public List<T> batchSearch(String pageUrl, int maxPagesCap) {
return batchSearch(pageUrl, maxPagesCap, 20);
}

/**
* Batch search using the provided starting page URL, max pages cap and throttling delay.
* Use the value -1 as the max pages cap to visit all available pages.
*
* @param pageUrl starting page URL
* @param maxPagesCap maximum number of pages (the search stops once this many pages have been requested or no more pages are available)
* @param throttlingDelayMs delay between two consecutive page requests, in milliseconds
* @return list of retrieved items
*/
@SneakyThrows
public List<T> batchSearch(String pageUrl, int maxPagesCap, long throttlingDelayMs) {
assert (throttlingDelayMs > 0);
log.info("Batch search. Starting URL={}, maxPagesCap={}, delay={}ms", pageUrl, maxPagesCap, throttlingDelayMs);

int pagesCounter = 0;
var nextPage = pageUrl;
List<T> result = new ArrayList<>();

while (nextPage != null && (maxPagesCap == -1 || pagesCounter < maxPagesCap)) {
SingleSearchResult<T> searchResult = singleSearch(nextPage);
List<T> dataItems = searchResult.getDataItems();
log.info("Hit №{}, {} items retrieved", pagesCounter, dataItems.size());
pagesCounter++;
result.addAll(dataItems);
nextPage = searchResult.getNextPageUrl();

Thread.sleep(throttlingDelayMs);
}
log.info("Total records retrieved: {}", result.size());

return result;
}

/**
* Search and extract data from the page with the provided URL
*
* @param pageUrl page URL
* @return search result
*/
@SneakyThrows
public SingleSearchResult<T> singleSearch(String pageUrl) {
Document document = Jsoup
.connect(pageUrl)
.userAgent(USER_AGENT).get();

Elements elements = extractElements(document);

List<T> dataItems = elements.parallelStream()
.map(this::mapElementToData)
.toList();
log.debug("Single search: url={}, items={}", pageUrl, dataItems.size());

String prevUrl = extractPrevUrl(document);
String nextUrl = extractNextUrl(document);
return new SingleSearchResult<>(dataItems, prevUrl, nextUrl);
}

protected abstract Elements extractElements(Document document);

protected String extractPrevUrl(Document document) {
return null;
}

protected abstract String extractNextUrl(Document document);

protected abstract T mapElementToData(Element element);
}
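
For orientation, here is a minimal sketch of a concrete crawler built on this engine. The class name, target site and CSS selectors below are illustrative assumptions and are not part of this commit; the actual implementation is the reworked SearchUtil shown further down.

package by.andd3dfx.crawler.example; // hypothetical package, for illustration only

import by.andd3dfx.crawler.dto.CrawlerData;
import by.andd3dfx.crawler.engine.WebCrawler;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.List;

public class QuotesCrawler extends WebCrawler<QuotesCrawler.Quote> {

    // Simple DTO implementing the CrawlerData marker interface
    public record Quote(String text, String author) implements CrawlerData {
    }

    @Override
    protected Elements extractElements(Document document) {
        // One element per data item on the page (selector is an assumption)
        return document.select("div.quote");
    }

    @Override
    protected String extractNextUrl(Document document) {
        // Absolute URL of the next page, or null when there is no next page
        Element next = document.selectFirst("li.next > a");
        return next == null ? null : next.absUrl("href");
    }

    @Override
    protected Quote mapElementToData(Element element) {
        return new Quote(
                element.select("span.text").text(),
                element.select("small.author").text());
    }

    public static void main(String[] args) {
        // Visit at most 3 pages with a 100 ms delay between consecutive requests
        List<Quote> quotes = new QuotesCrawler().batchSearch("https://quotes.toscrape.com/", 3, 100);
        quotes.forEach(quote -> System.out.println(quote.author() + ": " + quote.text()));
    }
}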
8 changes: 8 additions & 0 deletions crawler-engine/src/main/resources/log4j.properties
@@ -0,0 +1,8 @@
# Root Logger
log4j.rootLogger=INFO, stdout

# Direct log messages to stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.Target=System.out
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
20 changes: 20 additions & 0 deletions crawler-engine/src/main/resources/log4j2.xml
@@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Extra logging related to initialization of Log4j.
Set to debug or trace if log4j initialization is failing. -->
<Configuration status="WARN">
<Appenders>
<Console name="Console" target="SYSTEM_OUT">
<PatternLayout pattern="%d{HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
</Console>
</Appenders>
<Loggers>
<!--
<Logger name="<your_package_name>.<your_class_name>" level="debug">
<AppenderRef ref="Console"/>
</Logger>
-->
<Root level="INFO">
<AppenderRef ref="Console"/>
</Root>
</Loggers>
</Configuration>
1 change: 1 addition & 0 deletions pom.xml
@@ -19,6 +19,7 @@
</properties>

<modules>
<module>crawler-engine</module>
<module>pravtor.ru-crawler</module>
<module>rabota.by-crawler</module>
</modules>
5 changes: 5 additions & 0 deletions pravtor.ru-crawler/pom.xml
@@ -19,6 +19,11 @@
</properties>

<dependencies>
<dependency>
<groupId>by.andd3dfx</groupId>
<artifactId>crawler-engine</artifactId>
<version>1.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
@@ -1,11 +1,12 @@
package by.andd3dfx.pravtor.model;

import by.andd3dfx.crawler.dto.CrawlerData;
import lombok.Builder;
import lombok.Data;

@Builder
@Data
public class TorrentData {
public class TorrentData implements CrawlerData {

private String label;
private String linkUrl;
@@ -1,86 +1,48 @@
package by.andd3dfx.pravtor.util;

import by.andd3dfx.pravtor.model.SingleSearchResult;
import by.andd3dfx.crawler.engine.WebCrawler;
import by.andd3dfx.pravtor.model.TorrentData;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;

import static java.lang.Thread.sleep;

/**
* Util to perform search on <a href="http://pravtor.ru">pravtor.ru</a> torrent tracker
*/
@Slf4j
public class SearchUtil {
public class SearchUtil extends WebCrawler<TorrentData> {

private static final String USER_AGENT = "Mozilla";
private static final String PREFIX = "https://pravtor.ru/";

public List<TorrentData> batchSearch(String startingPageUrl, int maxPagesCap, long throttlingDelay)
throws InterruptedException, IOException {

log.info("Starting URL: {}, maxPagesCap={}, delay={}ms", startingPageUrl, maxPagesCap, throttlingDelay);

String nextPageUrl = startingPageUrl;
int pagesCounter = 0;
List<TorrentData> result = new ArrayList<>();

while (nextPageUrl != null && (maxPagesCap == -1 || pagesCounter < maxPagesCap)) {

SingleSearchResult singleSearchResult = singleSearch(nextPageUrl);
log.info("Hit {}, {} retrieved", pagesCounter, singleSearchResult.getDataItems().size());
pagesCounter++;
nextPageUrl = singleSearchResult.getNextPageUrl();
result.addAll(singleSearchResult.getDataItems());

sleep(throttlingDelay);
}
log.info("Records retrieved: {}", result.size());

return result;
@Override
protected Elements extractElements(Document document) {
return document.select("tr[id^=tr-]");
}

SingleSearchResult singleSearch(String startingPageUrl) throws IOException {
Document document = Jsoup
.connect(startingPageUrl)
.userAgent(USER_AGENT).get();

Elements elements = document.select("tr[id^=tr-]");

List<TorrentData> dataItems = elements.stream()
.map(element -> TorrentData.builder()
.label(element.select("div[class=torTopic]").select("a").text())
.linkUrl(extractLink(element.select("a[class=torTopic]").attr("href")))
.seedsCount(convertToInteger(element.select("span[title=Seeders]").text()))
.peersCount(convertToInteger(element.select("span[title=Leechers]").text()))
.size(element.select("div[title=Скачать .torrent]").select("div[class=small]").text())
.downloadedCount(convertToInteger(element.select("p[title=Скачан]").text()))
.build()
).toList();

String prevUrl = extractPrevOrNext(document, "Пред.");
String nextUrl = extractPrevOrNext(document, "След.");
return new SingleSearchResult(dataItems, prevUrl, nextUrl);
@Override
protected String extractPrevUrl(Document document) {
return extractPrevOrNext(document, "Пред.");
}

private String extractLink(String href) {
return StringUtils.isEmpty(href) ? href : PREFIX + href.substring(2);
@Override
protected String extractNextUrl(Document document) {
return extractPrevOrNext(document, "След.");
}

private Integer convertToInteger(String value) {
if (!StringUtils.isNumeric(value)) {
return null;
}
return Integer.parseInt(value);
@Override
protected TorrentData mapElementToData(Element element) {
return TorrentData.builder()
.label(element.select("div[class=torTopic]").select("a").text())
.linkUrl(extractLink(element.select("a[class=torTopic]").attr("href")))
.seedsCount(convertToInteger(element.select("span[title=Seeders]").text()))
.peersCount(convertToInteger(element.select("span[title=Leechers]").text()))
.size(element.select("div[title=Скачать .torrent]").select("div[class=small]").text())
.downloadedCount(convertToInteger(element.select("p[title=Скачан]").text()))
.build();
}

private String extractPrevOrNext(Document document, String value) {
@@ -94,4 +56,15 @@ private String extractPrevOrNext(Document document, String value) {
}
return PREFIX + pageItems.get(0).attr("href");
}

private String extractLink(String href) {
return StringUtils.isEmpty(href) ? href : PREFIX + href.substring(2);
}

private Integer convertToInteger(String value) {
if (!StringUtils.isNumeric(value)) {
return null;
}
return Integer.parseInt(value);
}
}
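
With the paging and throttling loop moved into WebCrawler, callers of the pravtor.ru crawler only need the inherited batchSearch method. A hedged usage sketch follows; the topic URL and the caller class are illustrative and not part of this commit.

import by.andd3dfx.pravtor.model.TorrentData;
import by.andd3dfx.pravtor.util.SearchUtil;

import java.util.List;

public class PravtorSearchDemo {

    public static void main(String[] args) {
        // batchSearch(pageUrl, maxPagesCap, throttlingDelayMs) is inherited from WebCrawler
        List<TorrentData> items = new SearchUtil().batchSearch("https://pravtor.ru/viewforum.php?f=17", 5, 50);
        items.forEach(item -> System.out.println(item.getLabel() + " - " + item.getLinkUrl()));
    }
}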