Skip to content

Commit

Permalink
Refactoring: use records, remove/inline extra methods, remove extra try/catch
Browse files Browse the repository at this point in the history
  • Loading branch information
andrei-punko committed Nov 4, 2024
1 parent 8b2cf03 commit b3e22ef
Show file tree
Hide file tree
Showing 13 changed files with 52 additions and 85 deletions.
Original file line number Diff line number Diff line change
@@ -1,15 +1,6 @@
package by.andd3dfx.crawler.dto;

import lombok.Getter;
import lombok.RequiredArgsConstructor;

import java.util.List;

@Getter
@RequiredArgsConstructor
public class SingleSearchResult<T extends CrawlerData> {

private final List<T> dataItems;
private final String prevPageUrl;
private final String nextPageUrl;
/**
 * Result of crawling a single page.
 *
 * @param dataItems   data items extracted from the page
 * @param nextPageUrl URL of the next page to crawl; batch search stops iterating when it is {@code null}
 * @param <T>         type of the crawled data items
 */
public record SingleSearchResult<T extends CrawlerData>(List<T> dataItems, String nextPageUrl) {
}
Original file line number Diff line number Diff line change
Expand Up @@ -65,11 +65,11 @@ public List<T> batchSearch(String pageUrl, int maxPagesCap, long throttlingDelay

while (nextPage != null && (maxPagesCap == -1 || pagesCounter < maxPagesCap)) {
SingleSearchResult<T> searchResult = singleSearch(nextPage);
List<T> dataItems = searchResult.getDataItems();
List<T> dataItems = searchResult.dataItems();
log.info("Hit №{}, {} items retrieved", pagesCounter, dataItems.size());
pagesCounter++;
result.addAll(dataItems);
nextPage = searchResult.getNextPageUrl();
nextPage = searchResult.nextPageUrl();

Thread.sleep(throttlingDelayMs);
}
Expand Down Expand Up @@ -97,17 +97,12 @@ public SingleSearchResult<T> singleSearch(String pageUrl) {
.toList();
log.debug("Single search: url={}, items={}", pageUrl, dataItems.size());

String prevUrl = extractPrevUrl(document);
String nextUrl = extractNextUrl(document);
return new SingleSearchResult(dataItems, prevUrl, nextUrl);
return new SingleSearchResult(dataItems, nextUrl);
}

protected abstract Elements extractElements(Document document);

protected String extractPrevUrl(Document document) {
return null;
}

protected abstract String extractNextUrl(Document document);

protected abstract T mapElementToData(Element element);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,20 @@

public class MainApp {

private static final SearchUtil searchUtil = new SearchUtil();;
private static final FileUtil fileUtil = new FileUtil();

public static void main(String[] args) throws IOException {
if (args.length != 2) {
throw new IllegalArgumentException("Should be 2 parameters!");
throw new IllegalArgumentException("Two 2 params should be provided: paramsFileName & excelFileName!");
}
String paramsFileName = args[0];
String excelFileName = args[1];

var searchUtil = new SearchUtil();
var fileUtil = new FileUtil();

var searchItems = new ArrayList<BatchSearchResult>();
for (var searchCriteria : fileUtil.loadSearchCriteria(paramsFileName)) {
String startingUrl = searchCriteria.getUrl();
String label = searchCriteria.getTopic();
String startingUrl = searchCriteria.url();
String label = searchCriteria.topic();

var result = searchUtil.batchSearch(startingUrl, -1, 20)
.stream()
Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,7 @@
package by.andd3dfx.pravtor.dto;

import java.util.List;
import lombok.Getter;
import lombok.RequiredArgsConstructor;

@Getter
@RequiredArgsConstructor
public class BatchSearchResult {
public record BatchSearchResult(String topic, List<TorrentData> dataItems) {

private final String topic;
private final List<TorrentData> dataItems;
}
Original file line number Diff line number Diff line change
@@ -1,12 +1,5 @@
package by.andd3dfx.pravtor.dto;

import lombok.Getter;
import lombok.RequiredArgsConstructor;
public record SearchCriteria(String topic, String url) {

@Getter
@RequiredArgsConstructor
public class SearchCriteria {

private final String topic;
private final String url;
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,15 @@
import java.nio.file.Paths;
import java.util.List;

/**
* Util to work with files
*/
public class FileUtil {

public static final String[] HEADER_LABELS = {"Название", "Seeds", "Peers", "Скачано", "Размер", "Ссылка"};

/**
* Load list of search criteria items
* Load list of search criteria items from file
*
* @param fileName name of params file
* @return list of search criteria items
Expand All @@ -43,7 +46,7 @@ public List<SearchCriteria> loadSearchCriteria(String fileName) throws IOExcepti
public void writeIntoExcel(String fileName, List<BatchSearchResult> searchItems) throws IOException {
try (var book = new HSSFWorkbook();) {
searchItems.forEach(searchItem -> {
Sheet sheet = book.createSheet(searchItem.getTopic());
Sheet sheet = book.createSheet(searchItem.topic());

populateHeaderLabels(sheet);
populateContent(sheet, searchItem);
Expand All @@ -66,7 +69,7 @@ private void populateHeaderLabels(Sheet sheet) {

private void populateContent(Sheet sheet, BatchSearchResult searchItem) {
int rowsCount = 1;
for (TorrentData dataItem : searchItem.getDataItems()) {
for (TorrentData dataItem : searchItem.dataItems()) {
int column_number = 0;
Row row = sheet.createRow(rowsCount);
row.createCell(column_number++).setCellValue(dataItem.getLabel());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,21 +16,25 @@
@Slf4j
public class SearchUtil extends WebCrawler<TorrentData> {

private static final String PREFIX = "https://pravtor.ru/";
private static final String BASE_URL = "https://pravtor.ru/";

@Override
protected Elements extractElements(Document document) {
return document.select("tr[id^=tr-]");
}

@Override
protected String extractPrevUrl(Document document) {
return extractPrevOrNext(document, "Пред.");
}

@Override
protected String extractNextUrl(Document document) {
return extractPrevOrNext(document, "След.");
List<Element> pageItems = document
.select("td[class=tRight vBottom nowrap small]")
.select("a").stream()
.filter(s -> s.text().contains("След."))
.toList();

if (pageItems.isEmpty()) {
return null;
}
return BASE_URL + pageItems.get(0).attr("href");
}

@Override
Expand All @@ -45,20 +49,11 @@ protected TorrentData mapElementToData(Element element) {
.build();
}

private String extractPrevOrNext(Document document, String value) {
List<Element> pageItems = document.select("td[class=tRight vBottom nowrap small]")
.select("a").stream()
.filter(s -> s.text().contains(value))
.toList();

if (pageItems.isEmpty()) {
return null;
}
return PREFIX + pageItems.get(0).attr("href");
}

private String extractLink(String href) {
return StringUtils.isEmpty(href) ? href : PREFIX + href.substring(2);
if (StringUtils.isEmpty(href)) {
return href;
}
return BASE_URL + href.substring(2);
}

private Integer convertToInteger(String value) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ private void runMainNCheckExceptionThrow(String[] args) throws Exception {
mainApp.main(args);
fail("Exception should be thrown");
} catch (IllegalArgumentException iae) {
assertThat(iae.getMessage(), is("Should be 2 parameters!"));
assertThat(iae.getMessage(), is("Two 2 params should be provided: paramsFileName & excelFileName!"));
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,11 @@ public void loadSearchCriteria() throws IOException {

assertThat("Wrong count of criteria items", criteriaItems.size(), is(2));
var item0 = criteriaItems.get(0);
assertThat("Wrong url of first item", item0.getTopic(), is("txt-molitvy"));
assertThat("Wrong label of first item", item0.getUrl(), is("https://pravtor.ru/viewforum.php?f=184"));
assertThat("Wrong url of first item", item0.topic(), is("txt-molitvy"));
assertThat("Wrong label of first item", item0.url(), is("https://pravtor.ru/viewforum.php?f=184"));
var item1 = criteriaItems.get(1);
assertThat("Wrong label of second item", item1.getTopic(), is("txt-kanony"));
assertThat("Wrong url of second item", item1.getUrl(), is("https://pravtor.ru/viewforum.php?f=183"));
assertThat("Wrong label of second item", item1.topic(), is("txt-kanony"));
assertThat("Wrong url of second item", item1.url(), is("https://pravtor.ru/viewforum.php?f=183"));
}

@Test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,6 @@ public void batchSearch() {
public void singleSearch() {
var result = searchUtil.singleSearch(STARTING_URL);

assertThat("Wrong amount of result records", result.getDataItems().size(), is(50));
assertThat("Wrong amount of result records", result.dataItems().size(), is(50));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ public static void main(String[] args) throws IOException {
}

var searchUtil = new SearchUtil();
var pageUrl = searchUtil.buildSearchUrl("java");
var pageUrl = searchUtil.buildStartingSearchUrl("java");
var searchResult = searchUtil.batchSearch(pageUrl);

var statisticsSortedMap = new StatisticsUtil().collectStatistics(searchResult);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,24 @@

import by.andd3dfx.crawler.engine.WebCrawler;
import by.andd3dfx.rabotaby.dto.VacancyData;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.util.Arrays;
import java.util.stream.Collectors;

@Slf4j
public class SearchUtil extends WebCrawler<VacancyData> {

private final String URL_PREFIX = "http://rabota.by";
private final String searchUrlFormat = URL_PREFIX + "/search/vacancy?area=1002&text=%s&page=%d";
private final String BASE_URL = "http://rabota.by";

public String buildSearchUrl(String searchString) {
return String.format(searchUrlFormat, searchString, 0);
public String buildStartingSearchUrl(String searchString) {
final var format = BASE_URL + "/search/vacancy?area=1002&text=%s&page=%d";
return String.format(format, searchString, 0);
}

@Override
Expand All @@ -33,21 +33,17 @@ protected String extractNextUrl(Document document) {
if (nextPageItem.isEmpty()) {
return null;
}
return URL_PREFIX + nextPageItem.attr("href");
return BASE_URL + nextPageItem.attr("href");
}

@SneakyThrows
@Override
protected VacancyData mapElementToData(Element element) {
String searchUrl = element.select("a").attr("href");
log.info("Retrieve vacancy details for {}", searchUrl);
Document document;
try {
document = Jsoup
Document document = Jsoup
.connect(searchUrl)
.userAgent(USER_AGENT).get();
} catch (IOException e) {
throw new RuntimeException("Retrieve details failed", e);
}

return VacancyData.builder()
.url(document.baseUri())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,17 @@ public void setup() {

@Test
public void singleSearch() {
var pageUrl = searchUtil.buildSearchUrl("java");
var pageUrl = searchUtil.buildStartingSearchUrl("java");
var result = searchUtil.singleSearch(pageUrl);

assertThat("Next url should be present", result.getNextPageUrl(), is(
assertThat("Next url should be present", result.nextPageUrl(), is(
"http://rabota.by/search/vacancy?area=1002&text=java&page=1&hhtmFrom=vacancy_search_list"));
assertThat("At least 20 items expected", result.getDataItems().size(), greaterThanOrEqualTo(RECORDS_PER_PAGE));
assertThat("At least 20 items expected", result.dataItems().size(), greaterThanOrEqualTo(RECORDS_PER_PAGE));
}

@Test
public void batchSearch() {
var pageUrl = searchUtil.buildSearchUrl("java");
var pageUrl = searchUtil.buildStartingSearchUrl("java");
var searchResult = searchUtil.batchSearch(pageUrl, 2);

assertThat(searchResult.size(), is(2 * RECORDS_PER_PAGE));
Expand Down

0 comments on commit b3e22ef

Please sign in to comment.