diff --git a/src/main/java/com/nadoyagsa/pillaroid/component/JsoupComponent.java b/src/main/java/com/nadoyagsa/pillaroid/component/JsoupComponent.java deleted file mode 100644 index 55cb7a5..0000000 --- a/src/main/java/com/nadoyagsa/pillaroid/component/JsoupComponent.java +++ /dev/null @@ -1,108 +0,0 @@ -package com.nadoyagsa.pillaroid.component; - -import com.nadoyagsa.pillaroid.dto.Appearance; -import com.nadoyagsa.pillaroid.dto.Medicine; -import org.jsoup.Connection; -import org.jsoup.HttpStatusException; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; -import org.springframework.stereotype.Component; - -import java.io.IOException; - -@Component -public class JsoupComponent { - public Medicine getMedicineInfo(String url) { - final String medicineUrl = url; - - Connection conn = Jsoup.connect(medicineUrl); - try { - Document document = conn.get(); - - Elements nameElements = document.getElementsByClass("stress"); - if (nameElements.size() > 0) { - Element parent = nameElements.get(0).parent(); - // 네이버 의약품 검색 크롤링 중

로 인해 잘린 부분이 있을 수 있어서 다음으로 변경 - String parentProcessed = parent.html() - .replace("

\n

", "") - .replace("

", "") - .replaceAll("]*>\\s*

", ""); //p 태그 중 속성값이 있는데 내용은 없는 컴포넌트 삭제 - - Document documentProcessed = Jsoup.parse(parentProcessed); - - return getMedicineInfo(documentProcessed); - } - return getMedicineInfo(document); - } catch (HttpStatusException e) { - return Medicine.builder().build(); - } catch (IOException e) { - e.printStackTrace(); - } - - return null; - } - - public Medicine getMedicineInfo(Document document) { // 지식백과에서의 의약품명 크롤링 - Medicine medicine = new Medicine(); - - Elements nameElements = document.getElementsByClass("stress"); - for (Element nameElement : nameElements) { // 외형정보, 성분정보, 저장방법, 효능효과, 용법용량, 사용상 주의사항 - String topic = nameElement.text(); - Element textElement = nameElement.nextElementSibling(); - - if (textElement != null) { - String text = textElement.html() - .replace("
", "\n") // 줄바꿈 모두 저장 - .replaceAll("\\[허가사항변경[^]]*]\n*", "") // [허가사항변경] 관련 불필요 내용 제거 - .replaceAll("<[^>]*>", "") // 태그 모두 제거 - .strip(); - - switch (topic) { - case "외형정보": - Appearance appearanceInfo = new Appearance(); - - String[] splitTopic = text.split("·"); - for (String subTopic : splitTopic) { - String[] information = subTopic.split(":"); - - if (information.length <= 1) - continue; - - if (information[0].contains("성상")) - appearanceInfo.setAppearance(information[1].trim()); - else if (information[0].contains("제형")) - appearanceInfo.setFormulation(information[1].trim()); - else if (information[0].contains("모양")) - appearanceInfo.setShape(information[1].trim()); - else if (information[0].contains("색상")) - appearanceInfo.setColor(information[1].trim()); - else if (information[0].contains("분할선")) - appearanceInfo.setDividingLine(information[1].trim()); - else if (information[0].contains("식별표기")) - appearanceInfo.setIdentificationMark(information[1].trim()); - } - medicine.setAppearanceInfo(appearanceInfo); - break; - case "성분정보": - medicine.setIngredient(text); - break; - case "저장방법": - medicine.setSave(text); - break; - case "효능효과": - medicine.setEfficacy(text); - break; - case "용법용량": - medicine.setUsage(text); - break; - case "사용상 주의사항": - medicine.setPrecautions(text); - break; - } - } - } - return medicine; - } -} diff --git a/src/main/java/com/nadoyagsa/pillaroid/component/MedicineCrawlUtil.java b/src/main/java/com/nadoyagsa/pillaroid/component/MedicineCrawlUtil.java new file mode 100644 index 0000000..b856af0 --- /dev/null +++ b/src/main/java/com/nadoyagsa/pillaroid/component/MedicineCrawlUtil.java @@ -0,0 +1,114 @@ +package com.nadoyagsa.pillaroid.component; + +import com.nadoyagsa.pillaroid.dto.Appearance; +import com.nadoyagsa.pillaroid.dto.Medicine; +import org.jsoup.Connection; +import org.jsoup.HttpStatusException; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; +import org.springframework.stereotype.Component; + +import java.io.IOException; + +@Component +public class MedicineCrawlUtil { + public Medicine getMedicineInfo(String medicineUrl) { + Connection conn = Jsoup.connect(medicineUrl); + try { + Document document = conn.get(); + + Elements nameElements = document.getElementsByClass("stress"); + if (nameElements.size() > 0) { + Element parent = nameElements.get(0).parent(); + // 네이버 의약품 검색 크롤링 중

로 인해 잘린 부분이 있을 수 있어서 다음으로 변경 + String parentProcessed = parent.html() + .replace("

\n

", "") + .replace("

", "") + .replaceAll("]*>\\s*

", ""); //p 태그 중 속성값이 있는데 내용은 없는 컴포넌트 삭제 + + Document documentProcessed = Jsoup.parse(parentProcessed); + + return getMedicineInfo(documentProcessed); + } + return getMedicineInfo(document); + } catch (HttpStatusException e) { + return Medicine.builder().build(); + } catch (IOException e) { + e.printStackTrace(); + } + + return null; + } + + public Medicine getMedicineInfo(Document document) { // 지식백과에서의 의약품명 크롤링 + Medicine medicine = new Medicine(); + + Elements nameElements = document.getElementsByClass("stress"); + for (Element nameElement : nameElements) { // 외형정보, 성분정보, 저장방법, 효능효과, 용법용량, 사용상 주의사항 + String topic = nameElement.text(); + Element textElement = nameElement.nextElementSibling(); + + if (textElement != null) { + if (topic.equals("외형정보")) { + Appearance appearanceInfo = new Appearance(); + + String[] splitTopic = textElement.html() + .split(""); // 외형정보 안의 소주제를 나눔 + + for (String subTopic : splitTopic) { + String subText = subTopic + .replace("
", "\n") + .replaceAll("<[^>]*>", "").strip(); + + String[] information = subText.split(":", 2); + + if (information.length <= 1) + continue; + + if (information[0].contains("성상")) + appearanceInfo.setAppearance(information[1].trim()); + else if (information[0].contains("제형")) + appearanceInfo.setFormulation(information[1].trim()); + else if (information[0].contains("모양")) + appearanceInfo.setShape(information[1].trim()); + else if (information[0].contains("색상")) + appearanceInfo.setColor(information[1].trim()); + else if (information[0].contains("분할선")) + appearanceInfo.setDividingLine(information[1].trim()); + else if (information[0].contains("식별표기")) + appearanceInfo.setIdentificationMark(information[1].trim()); + } + medicine.setAppearanceInfo(appearanceInfo); + } + else { + String text = textElement.html() + .replace("
", "\n") // 줄바꿈 모두 저장 + .replaceAll("\\[허가사항변경[^]]*]\n*", "") // [허가사항변경] 관련 불필요 내용 제거 + .replaceAll("<[^>]*>", "") // 태그 모두 제거 + .strip(); + + switch (topic) { + case "성분정보": + medicine.setIngredient(text); + break; + case "저장방법": + medicine.setSave(text); + break; + case "효능효과": + medicine.setEfficacy(text); + break; + case "용법용량": + medicine.setUsage(text); + break; + case "사용상 주의사항": + medicine.setPrecautions(text); + break; + } + } + } + } + return medicine; + } +} diff --git a/src/main/java/com/nadoyagsa/pillaroid/component/MedicineExcelUtils.java b/src/main/java/com/nadoyagsa/pillaroid/component/MedicineExcelUtils.java index 491b423..0d013a2 100644 --- a/src/main/java/com/nadoyagsa/pillaroid/component/MedicineExcelUtils.java +++ b/src/main/java/com/nadoyagsa/pillaroid/component/MedicineExcelUtils.java @@ -27,7 +27,7 @@ @Component @RequiredArgsConstructor public class MedicineExcelUtils { - private final JsoupComponent jsoupComponent; + private final MedicineCrawlUtil medicineCrawlUtil; public void updateMedicineExcel() throws IOException { @@ -113,7 +113,7 @@ private HashMap crawlMedicineInfo(String productLink) { String detailBaseUrl = "https://terms.naver.com"; String detailUrl = detailBaseUrl + productLink; - Medicine medicineInfo = jsoupComponent.getMedicineInfo(detailUrl); + Medicine medicineInfo = medicineCrawlUtil.getMedicineInfo(detailUrl); //(수정할 colIdx, content)로 된 hashMap HashMap result = new HashMap<>(); diff --git a/src/main/java/com/nadoyagsa/pillaroid/service/MedicineService.java b/src/main/java/com/nadoyagsa/pillaroid/service/MedicineService.java index 412dde9..b63c288 100644 --- a/src/main/java/com/nadoyagsa/pillaroid/service/MedicineService.java +++ b/src/main/java/com/nadoyagsa/pillaroid/service/MedicineService.java @@ -1,20 +1,35 @@ package com.nadoyagsa.pillaroid.service; -import com.nadoyagsa.pillaroid.component.JsoupComponent; +import com.nadoyagsa.pillaroid.component.MedicineCrawlUtil; +import com.nadoyagsa.pillaroid.component.MedicineExcelUtils; import com.nadoyagsa.pillaroid.dto.Medicine; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; +import java.io.IOException; + @Service public class MedicineService { - private final JsoupComponent jsoupComponent; + private final MedicineCrawlUtil medicineCrawlUtil; + private final MedicineExcelUtils medicineExcelUtils; @Autowired - public MedicineService(JsoupComponent jsoupComponent) { - this.jsoupComponent = jsoupComponent; + public MedicineService(MedicineCrawlUtil medicineCrawlUtil, MedicineExcelUtils medicineExcelUtils) { + this.medicineCrawlUtil = medicineCrawlUtil; + this.medicineExcelUtils = medicineExcelUtils; } public Medicine getMedicineInfo() { - return jsoupComponent.getMedicineInfo(""); + return medicineCrawlUtil.getMedicineInfo(""); + } + + public boolean updateMedicineInfoInExcel() { + try { + medicineExcelUtils.updateMedicineExcel(); + return true; + } catch (IOException e) { + e.printStackTrace(); + return false; + } } }