과제(240604)

## 과제 : 웹 크롤러 구현

목표 : 웹 크롤러를 구현하여 웹 페이지의 정보를 수집하고, 이를 활용하여 웹 페이지를 분석하는 방법을 이해한다.

요구사항 : Yes24의 베스트 셀러 링크로부터 제목, 저자, 가격 정보를 출력하는 웹 크롤러를 구현한다.

Yes24의 베스트 셀러 링크는 다음과 같다.

[https://www.yes24.com/Product/Category/BestSeller?categoryNumber=001&Gcode=000_002_012](https://www.yes24.com/Product/Category/BestSeller?categoryNumber=001&Gcode=000_002_012)

## 과제 : 웹 크롤러 구현 답안 예

```java
import java.io.IOException;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.util.Arrays;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Downloads the Yes24 best-seller page and prints the title, author and price of
 * every book entry it can parse as a Markdown table on standard output.
 */
public class Yes24BestSellerCrawler {

    /** Address of the best-seller listing that is crawled. */
    private static final String BEST_SELLER_URL =
            "https://www.yes24.com/Product/Category/BestSeller?categoryNumber=001&Gcode=000_002_012";

    /** Captures the inner text of the anchor whose class attribute is {@code gd_name}. */
    private static final Pattern NAME_PARSER = Pattern.compile("<a class=\"gd_name\"[^>]*>(.*?)</a>");

    /** Captures the text in front of a closing anchor tag that is followed by " 저". */
    private static final Pattern AUTH_PARSER = Pattern.compile(">(.*?)</a> 저");

    /** Captures a digits-and-commas amount in front of {@code </em>원</strong>}. */
    private static final Pattern PRICE_PARSER = Pattern.compile(">([\\d,]+)</em>원</strong>");

    /**
     * Fetches the listing, splits it into per-book fragments and prints one table row
     * per fragment in which title, author and price were all found.
     *
     * <p>Failures are reported on standard error and end the run without throwing.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        HttpClient client = HttpClient.newHttpClient();
        HttpRequest request = HttpRequest.newBuilder()
                .uri(URI.create(BEST_SELLER_URL))
                .build();

        HttpResponse<String> response;
        try {
            response = client.send(request, HttpResponse.BodyHandlers.ofString());
        } catch (IOException e) {
            System.err.println("Failed to fetch " + BEST_SELLER_URL + ": " + e);
            return;
        } catch (InterruptedException e) {
            // Restore the interrupt flag so code further up the stack can still observe it.
            Thread.currentThread().interrupt();
            System.err.println("Interrupted while fetching " + BEST_SELLER_URL);
            return;
        }

        // The default client does not follow redirects, so anything other than 200
        // would otherwise be parsed as if it were the listing and yield an empty table.
        if (response.statusCode() != 200) {
            System.err.println("Unexpected HTTP status " + response.statusCode()
                    + " from " + BEST_SELLER_URL);
            return;
        }

        // Each element is the text between two "<li data-goods-no=" markers; ".*" also
        // consumes the rest of the marker's own line because "." stops at a line break.
        String[] items = response.body().split("<li data-goods-no=.*");

        System.out.println("| 제목 | 저자 | 가격 |");
        System.out.println("| --- | --- | --- |");
        Arrays.stream(items)
                .filter(item -> item.contains("gd_name"))
                .forEach(Yes24BestSellerCrawler::printRow);
    }

    /**
     * Prints one Markdown table row for a book fragment, or nothing when any of the
     * three fields cannot be found in it.
     *
     * @param item HTML fragment belonging to a single list entry
     */
    private static void printRow(String item) {
        Matcher nameMatcher = NAME_PARSER.matcher(item);
        Matcher authMatcher = AUTH_PARSER.matcher(item);
        Matcher priceMatcher = PRICE_PARSER.matcher(item);

        if (nameMatcher.find() && authMatcher.find() && priceMatcher.find()) {
            System.out.println("| " + nameMatcher.group(1)
                    + " | " + authMatcher.group(1)
                    + " | " + priceMatcher.group(1) + " |");
        }
    }
}
```