Skip to content

Commit

Permalink
[fix] 정보 잘림 문제 해결 등 크롤링 완료 (#8)
Browse files Browse the repository at this point in the history
  • Loading branch information
youngniw committed May 26, 2022
1 parent d5ae801 commit c03a32f
Show file tree
Hide file tree
Showing 4 changed files with 136 additions and 115 deletions.
108 changes: 0 additions & 108 deletions src/main/java/com/nadoyagsa/pillaroid/component/JsoupComponent.java

This file was deleted.

114 changes: 114 additions & 0 deletions src/main/java/com/nadoyagsa/pillaroid/component/MedicineCrawlUtil.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
package com.nadoyagsa.pillaroid.component;

import com.nadoyagsa.pillaroid.dto.Appearance;
import com.nadoyagsa.pillaroid.dto.Medicine;
import org.jsoup.Connection;
import org.jsoup.HttpStatusException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Component;

import java.io.IOException;
import java.util.regex.Pattern;

/**
 * Fetches a medicine detail page with jsoup and maps its sections onto a {@link Medicine}.
 *
 * <p>Sections are located through elements carrying the CSS class {@code stress}; the text of
 * such an element is treated as the section name and its next sibling element as the section
 * body.
 */
@Component
public class MedicineCrawlUtil {
    /** CSS class of the elements whose text names a section. */
    private static final String SECTION_TITLE_CLASS = "stress";

    /** A {@code <p>} element that carries attributes but has no content. */
    private static final Pattern EMPTY_ATTRIBUTED_PARAGRAPH = Pattern.compile("<p\\s[^>]*>\\s*</p>");

    /** Any HTML tag. */
    private static final Pattern ANY_TAG = Pattern.compile("<[^>]*>");

    /** A bracketed "[허가사항변경 ...]" notice together with the newlines that follow it. */
    private static final Pattern LICENSE_CHANGE_NOTICE = Pattern.compile("\\[허가사항변경[^]]*]\n*");

    /**
     * Downloads the page at {@code medicineUrl} and extracts the medicine information from it.
     *
     * @param medicineUrl URL of the medicine detail page
     * @return the parsed medicine; an empty {@link Medicine} when the URL is null/blank or the
     *         server answers with an HTTP error status; {@code null} when any other
     *         {@link IOException} occurs while fetching (the stack trace is printed)
     */
    public Medicine getMedicineInfo(String medicineUrl) {
        // Jsoup.connect rejects an empty URL with an IllegalArgumentException; treat "no URL"
        // like "no page" instead of letting that escape to the caller.
        if (medicineUrl == null || medicineUrl.isBlank()) {
            return Medicine.builder().build();
        }

        try {
            Document document = Jsoup.connect(medicineUrl).get();

            Elements nameElements = document.getElementsByClass(SECTION_TITLE_CLASS);
            if (nameElements.isEmpty()) {
                return getMedicineInfo(document);
            }

            // Empty <p></p> elements in the crawled page can cut section content short,
            // so they are removed from the parent's markup before it is parsed again.
            Element parent = nameElements.get(0).parent();
            String parentProcessed = parent.html()
                    .replace("</p>\n<p></p>", "")
                    .replace("<p></p>", "");
            // Also drop <p> elements that have attributes but no content.
            parentProcessed = EMPTY_ATTRIBUTED_PARAGRAPH.matcher(parentProcessed).replaceAll("");

            return getMedicineInfo(Jsoup.parse(parentProcessed));
        } catch (HttpStatusException e) {
            return Medicine.builder().build();
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Extracts the medicine information from an already parsed document.
     *
     * <p>Recognised section names: 외형정보 (stored as {@link Appearance}), 성분정보, 저장방법,
     * 효능효과, 용법용량 and 사용상 주의사항. Sections with any other name, or without a following
     * sibling element, are ignored.
     *
     * @param document parsed page (or page fragment) containing the section elements
     * @return a {@link Medicine} with every recognised section filled in
     */
    public Medicine getMedicineInfo(Document document) {
        Medicine medicine = new Medicine();

        for (Element nameElement : document.getElementsByClass(SECTION_TITLE_CLASS)) {
            Element textElement = nameElement.nextElementSibling();
            if (textElement == null) {
                continue;
            }

            String topic = nameElement.text();
            if (topic.equals("외형정보")) {
                medicine.setAppearanceInfo(parseAppearance(textElement));
                continue;
            }

            String text = cleanSectionText(textElement);
            switch (topic) {
                case "성분정보":
                    medicine.setIngredient(text);
                    break;
                case "저장방법":
                    medicine.setSave(text);
                    break;
                case "효능효과":
                    medicine.setEfficacy(text);
                    break;
                case "용법용량":
                    medicine.setUsage(text);
                    break;
                case "사용상 주의사항":
                    medicine.setPrecautions(text);
                    break;
            }
        }
        return medicine;
    }

    /** Builds an {@link Appearance} from the "label: value" entries of the appearance section. */
    private Appearance parseAppearance(Element textElement) {
        Appearance appearanceInfo = new Appearance();

        // Each sub-topic of the appearance section starts with a <strong> label.
        for (String subTopic : textElement.html().split("<strong>")) {
            String subText = ANY_TAG.matcher(subTopic.replace("<br>", "\n"))
                    .replaceAll("")
                    .strip();

            String[] information = subText.split(":", 2);
            if (information.length <= 1) {
                continue; // no "label: value" pair in this fragment
            }

            String label = information[0];
            String value = information[1].trim();

            if (label.contains("성상")) {
                appearanceInfo.setAppearance(value);
            } else if (label.contains("제형")) {
                appearanceInfo.setFormulation(value);
            } else if (label.contains("모양")) {
                appearanceInfo.setShape(value);
            } else if (label.contains("색상")) {
                appearanceInfo.setColor(value);
            } else if (label.contains("분할선")) {
                appearanceInfo.setDividingLine(value);
            } else if (label.contains("식별표기")) {
                appearanceInfo.setIdentificationMark(value);
            }
        }
        return appearanceInfo;
    }

    /** Converts a section body to plain text: keeps line breaks, drops notices and tags. */
    private String cleanSectionText(Element textElement) {
        // NOTE(review): entities produced by html() (e.g. &amp;) are presumably left as-is
        // by this clean-up — confirm whether they should be unescaped.
        String text = textElement.html().replace("<br>", "\n");        // keep every line break
        text = LICENSE_CHANGE_NOTICE.matcher(text).replaceAll("");     // drop "[허가사항변경 ...]" notices
        text = ANY_TAG.matcher(text).replaceAll("");                   // drop all remaining tags
        return text.strip();
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
@Component
@RequiredArgsConstructor
public class MedicineExcelUtils {
private final JsoupComponent jsoupComponent;
private final MedicineCrawlUtil medicineCrawlUtil;

public void updateMedicineExcel() throws IOException {

Expand Down Expand Up @@ -113,7 +113,7 @@ private HashMap<Integer, String> crawlMedicineInfo(String productLink) {
String detailBaseUrl = "https://terms.naver.com";
String detailUrl = detailBaseUrl + productLink;

Medicine medicineInfo = jsoupComponent.getMedicineInfo(detailUrl);
Medicine medicineInfo = medicineCrawlUtil.getMedicineInfo(detailUrl);

//(수정할 colIdx, content)로 된 hashMap
HashMap<Integer, String> result = new HashMap<>();
Expand Down
25 changes: 20 additions & 5 deletions src/main/java/com/nadoyagsa/pillaroid/service/MedicineService.java
Original file line number Diff line number Diff line change
@@ -1,20 +1,35 @@
package com.nadoyagsa.pillaroid.service;

import com.nadoyagsa.pillaroid.component.JsoupComponent;
import com.nadoyagsa.pillaroid.component.MedicineCrawlUtil;
import com.nadoyagsa.pillaroid.component.MedicineExcelUtils;
import com.nadoyagsa.pillaroid.dto.Medicine;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import java.io.IOException;

/**
 * Service that exposes medicine crawling and the Excel update to callers.
 *
 * NOTE(review): this block looks like a rendered diff in which the JsoupComponent lines are the
 * pre-change side shown next to their MedicineCrawlUtil replacements — confirm before treating
 * it as compilable source.
 */
@Service
public class MedicineService {
private final JsoupComponent jsoupComponent;
private final MedicineCrawlUtil medicineCrawlUtil;
private final MedicineExcelUtils medicineExcelUtils;

/**
 * Stores the injected collaborators in the fields above.
 */
@Autowired
public MedicineService(JsoupComponent jsoupComponent) {
this.jsoupComponent = jsoupComponent;
public MedicineService(MedicineCrawlUtil medicineCrawlUtil, MedicineExcelUtils medicineExcelUtils) {
this.medicineCrawlUtil = medicineCrawlUtil;
this.medicineExcelUtils = medicineExcelUtils;
}

/**
 * Delegates to the crawler's getMedicineInfo, passing an empty string as the URL.
 *
 * @return whatever the crawler returns for that call
 */
public Medicine getMedicineInfo() {
return jsoupComponent.getMedicineInfo("");
return medicineCrawlUtil.getMedicineInfo("");
}

/**
 * Runs medicineExcelUtils.updateMedicineExcel().
 *
 * @return true when the call completes; false when it throws an IOException
 *         (the stack trace is printed and the exception is not rethrown)
 */
public boolean updateMedicineInfoInExcel() {
try {
medicineExcelUtils.updateMedicineExcel();
return true;
} catch (IOException e) {
e.printStackTrace();
return false;
}
}
}

0 comments on commit c03a32f

Please sign in to comment.