Skip to content

Commit

Permalink
Reorder methods and add some javadocs
Browse files Browse the repository at this point in the history
  • Loading branch information
andrei-punko committed Nov 30, 2024
1 parent 30c9a25 commit 1e1f139
Show file tree
Hide file tree
Showing 7 changed files with 77 additions and 40 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,12 @@

import java.util.List;

/**
 * Record holding the outcome of a search performed on a single page.
 *
 * @param dataItems   data items of type T found on the page
 * @param nextPageUrl URL of the page that follows the searched one
 * @param <T>         type of the data items
 */
public record SingleSearchResult<T extends CrawlerData>(
        List<T> dataItems,
        String nextPageUrl
) {
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
* Web crawler that retrieves a list of data items of type T by visiting pages consecutively.
* The process starts from the provided starting page URL; the link to the next page is retrieved from the current page.
*
* @param <T> data type
* @param <T> data items type
*/
@Slf4j
public abstract class WebCrawler<T extends CrawlerData> {
Expand All @@ -25,6 +25,53 @@ public abstract class WebCrawler<T extends CrawlerData> {
// Default limit on the number of pages for a batch search.
// NOTE(review): the usage site is not visible in this hunk - confirm against batchSearch.
private static final int DEFAULT_MAX_PAGES_CAP = 10;
// Default throttling delay, in milliseconds (per the _MS suffix).
// NOTE(review): presumably the pause between page requests - confirm against batchSearch.
private static final long DEFAULT_THROTTLING_DELAY_MS = 20;

/**
 * Fetch the page at the given URL and collect the data items it contains.
 * <p>
 * The page is downloaded with Jsoup using {@code USER_AGENT}. Its elements are selected by
 * {@link #extractElements(Document)}, each one is converted by
 * {@link #mapElementToData(Element)}, and the link to the following page is obtained from
 * {@link #extractNextUrl(Document)}.
 * <p>
 * The method is annotated with Lombok's {@code @SneakyThrows}, so checked exceptions raised
 * while fetching the page are not declared in the signature.
 *
 * @param pageUrl URL of the page to search
 * @return data items found on the page together with the next page URL
 */
@SneakyThrows
public SingleSearchResult<T> singleSearch(String pageUrl) {
    final Document page = Jsoup.connect(pageUrl)
            .userAgent(USER_AGENT)
            .get();

    final List<T> items = extractElements(page)
            .stream()
            .map(element -> mapElementToData(element))
            .toList();
    log.debug("Single search: url={}, items={}", pageUrl, items.size());

    // The next URL is extracted only after the items have been mapped and logged.
    return new SingleSearchResult<>(items, extractNextUrl(page));
}

/**
 * Extract the elements of interest from a parsed Jsoup document.
 * <p>
 * Called by {@link #singleSearch(String)}; every returned element is then passed to
 * {@link #mapElementToData(Element)}.
 *
 * @param document parsed Jsoup document of the fetched page
 * @return elements to be converted into data items
 */
protected abstract Elements extractElements(Document document);

/**
 * Extract the URL of the next page from a parsed Jsoup document.
 * <p>
 * Called by {@link #singleSearch(String)}; the returned value is stored as
 * {@code nextPageUrl} of the resulting {@link SingleSearchResult}.
 * NOTE(review): how "no next page" should be signalled (e.g. {@code null}) is not visible
 * here - confirm against the batch search implementation.
 *
 * @param document parsed Jsoup document of the fetched page
 * @return URL of the next page
 */
protected abstract String extractNextUrl(Document document);

/**
 * Map a single element to a result DTO object.
 * <p>
 * Called by {@link #singleSearch(String)} once for each element returned by
 * {@link #extractElements(Document)}.
 *
 * @param element Jsoup element to convert
 * @return DTO object of type T built from the element
 */
protected abstract T mapElementToData(Element element);

/**
* Batch search using provided starting page URL, max pages cap 10 and throttling delay 20ms
*
Expand Down Expand Up @@ -79,33 +126,4 @@ public List<T> batchSearch(String pageUrl, int maxPagesCap, long throttlingDelay

return result;
}

/**
 * Search and extract data from the page with the provided URL.
 * <p>
 * The page is fetched with Jsoup using {@code USER_AGENT}, its elements are selected via
 * {@code extractElements}, each of them is converted via {@code mapElementToData}, and the
 * link to the following page is obtained via {@code extractNextUrl}.
 * <p>
 * The method is annotated with Lombok's {@code @SneakyThrows}, so checked exceptions raised
 * while fetching the page are not declared in the signature.
 *
 * @param pageUrl URL of the page
 * @return search result: extracted data items plus the next page URL
 */
@SneakyThrows
public SingleSearchResult<T> singleSearch(String pageUrl) {
    Document document = Jsoup
            .connect(pageUrl)
            .userAgent(USER_AGENT)
            .get();

    Elements elements = extractElements(document);

    List<T> dataItems = elements.stream()
            .map(this::mapElementToData)
            .toList();
    log.debug("Single search: url={}, items={}", pageUrl, dataItems.size());

    String nextUrl = extractNextUrl(document);
    // Diamond operator instead of the raw type, so the result is type-checked as
    // SingleSearchResult<T> without an unchecked conversion.
    return new SingleSearchResult<>(dataItems, nextUrl);
}

/**
 * Extract the elements of interest from a parsed Jsoup document.
 * Called by singleSearch; every returned element is then passed to mapElementToData.
 *
 * @param document parsed Jsoup document of the fetched page
 * @return elements to be converted into data items
 */
protected abstract Elements extractElements(Document document);

/**
 * Extract the URL of the next page from a parsed Jsoup document.
 * Called by singleSearch; the returned value is placed into the search result.
 * NOTE(review): how "no next page" should be signalled is not visible here - confirm.
 *
 * @param document parsed Jsoup document of the fetched page
 * @return URL of the next page
 */
protected abstract String extractNextUrl(Document document);

/**
 * Map a single element to a result DTO object.
 * Called by singleSearch once for each element returned by extractElements.
 *
 * @param element Jsoup element to convert
 * @return DTO object of type T built from the element
 */
protected abstract T mapElementToData(Element element);
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,14 @@ public static void main(String[] args) throws IOException {
var searchItems = new ArrayList<BatchSearchResult>();
for (var searchCriteria : fileUtil.loadSearchCriteria(paramsFileName)) {
String startingUrl = searchCriteria.url();
String label = searchCriteria.topic();
String topic = searchCriteria.topic();

var result = crawler.batchSearch(startingUrl, -1, 20)
.stream()
.filter(torrentData -> torrentData.getDownloadedCount() != null)
.sorted(Comparator.comparingInt(TorrentData::getDownloadedCount).reversed())
.toList();
searchItems.add(new BatchSearchResult(label, result));
searchItems.add(new BatchSearchResult(topic, result));
}

fileUtil.writeIntoExcel(excelFileName, searchItems);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@

import java.util.List;

/**
 * Record that groups the data items collected by a batch search under their topic.
 *
 * @param topic     name of the topic
 * @param dataItems data items found for the topic
 */
public record BatchSearchResult(
        String topic,
        List<TorrentData> dataItems
) {
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
package by.andd3dfx.pravtor.dto;

/**
 * Record describing one search: a topic and the URL the search starts from.
 *
 * @param topic name of the topic
 * @param url   URL of the starting page
 */
public record SearchCriteria(
        String topic,
        String url
) {
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,14 @@
import java.util.List;

/**
* Util to work with files
* Util to work with files of types: text & excel
*/
public class FileUtil {

public static final String[] HEADER_LABELS = {"Название", "Seeds", "Peers", "Скачано", "Размер", "Ссылка"};

/**
* Load list of search criteria items from file
* Load list of search criteria items from plain text file
*
* @param fileName name of params file
* @return list of search criteria items
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,12 +62,6 @@ protected VacancyData mapElementToData(Element element) {
.build();
}

/**
 * Read the salary text from the vacancy document.
 * <p>
 * Selects {@code span} elements whose {@code data-qa} attribute equals
 * {@code vacancy-salary-compensation-type-net} and passes their text through
 * {@code StringUtils.trimToNull}.
 * NOTE(review): presumably Apache Commons Lang, in which case blank text becomes
 * {@code null} - confirm against the file's imports.
 *
 * @param document parsed Jsoup document of the vacancy page
 * @return result of {@code trimToNull} applied to the selected text
 */
private String extractSalary(Document document) {
    final String salaryText = document
            .select("span[data-qa=vacancy-salary-compensation-type-net]")
            .text();
    return StringUtils.trimToNull(salaryText);
}

private static String extractCompanyName(Document document) {
var elements = document.select("a[data-qa=vacancy-company-name]");
if (elements.isEmpty()) {
Expand All @@ -80,6 +74,12 @@ private static String extractTextContent(Document document) {
return document.select("div[data-qa=vacancy-description]").text();
}

/**
 * Read the salary text from the vacancy document.
 * <p>
 * Selects {@code span} elements whose {@code data-qa} attribute equals
 * {@code vacancy-salary-compensation-type-net} and passes their text through
 * {@code StringUtils.trimToNull}.
 * NOTE(review): presumably Apache Commons Lang, in which case blank text becomes
 * {@code null} - confirm against the file's imports.
 *
 * @param document parsed Jsoup document of the vacancy page
 * @return result of {@code trimToNull} applied to the selected text
 */
private String extractSalary(Document document) {
    final String rawSalary = document
            .select("span[data-qa=vacancy-salary-compensation-type-net]")
            .text();
    return StringUtils.trimToNull(rawSalary);
}

private static Set<String> extractKeywords(Document document) {
return document.select("li[data-qa=skills-element]")
.stream()
Expand Down

0 comments on commit 1e1f139

Please sign in to comment.