Skip to content

Commit

Permalink
Merge pull request #7 from andrei-punko/6-fix-error-during-determining-next-url-for-rabota.by
Browse files Browse the repository at this point in the history

Fix #6 error during determining next url for rabota.by crawler
  • Loading branch information
andrei-punko authored Nov 27, 2024
2 parents 49e6aed + d6d3461 commit 6b784da
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,13 @@ protected Elements extractElements(Document document) {

@Override
protected String extractNextUrl(Document document) {
Elements nextPageItem = document.select("a[data-qa=pager-next]");
if (nextPageItem.isEmpty()) {
return null;
Elements pagesATags = document.select("a[data-qa=pager-page]");
Element currentPage = pagesATags.select("[aria-current=true]").getFirst();
int nextIndex = pagesATags.indexOf(currentPage) + 1;
if (nextIndex < pagesATags.size()) {
return BASE_URL + pagesATags.get(nextIndex).attr("href");
}
return BASE_URL + nextPageItem.attr("href");
return null;
}

@SneakyThrows
Expand All @@ -47,8 +49,8 @@ protected VacancyData mapElementToData(Element element) {
String searchUrl = element.select("a").attr("href");
log.info("Retrieve vacancy details for {}", searchUrl);
Document document = Jsoup
.connect(searchUrl)
.userAgent(USER_AGENT).get();
.connect(searchUrl)
.userAgent(USER_AGENT).get();

return VacancyData.builder()
.url(document.baseUri())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
import org.junit.Before;
import org.junit.Test;

import static org.hamcrest.CoreMatchers.endsWith;
import static org.hamcrest.CoreMatchers.is;
import static org.hamcrest.CoreMatchers.startsWith;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.greaterThanOrEqualTo;

Expand All @@ -22,12 +24,13 @@ public void singleSearch() {
var pageUrl = crawler.buildStartingSearchUrl("java");
var result = crawler.singleSearch(pageUrl);

assertThat("Next url should be present", result.nextPageUrl(), is(
assertThat(result.nextPageUrl(), startsWith(
"""
http://rabota.by/search/vacancy?area=1002&text=java&page=1\
&hhtmFromLabel=vacancy_search_line\
&hhtmFrom=vacancy_search_list\
&searchSessionId=\
"""));
assertThat(result.nextPageUrl(), endsWith("&hhtmFrom=vacancy_search_list"));
assertThat("At least 20 items expected", result.dataItems().size(), greaterThanOrEqualTo(RECORDS_PER_PAGE));
}

Expand Down

0 comments on commit 6b784da

Please sign in to comment.