SW0000J

nate 크롤링 실패(한글 깨짐)

1 -const axios = require("axios"); 1 +const axios = require("axios"); // 웹 서버 요청 모듈
2 -const cheerio = require("cheerio"); 2 +const cheerio = require("cheerio"); // load한 것을 jQuery처럼 사용
3 -const log = console.log; 3 +const Iconv = require('iconv').Iconv; // 한글 깨짐 방지
4 +const iconv = new Iconv('CP949', 'utf-8//translit//ignore');
5 +
6 +const url = "https://sports.news.nate.com/baseball/"
4 7
5 const getHtml = async () => { 8 const getHtml = async () => {
6 try { 9 try {
7 - return await axios.get("https://sports.news.naver.com/kbaseball/news/index.nhn?isphoto=N&type=latest"); 10 + return await axios.get(url);
8 } catch (error) { 11 } catch (error) {
9 console.error(error); 12 console.error(error);
10 } 13 }
...@@ -13,21 +16,22 @@ const getHtml = async () => { ...@@ -13,21 +16,22 @@ const getHtml = async () => {
13 getHtml() 16 getHtml()
14 .then(html => { 17 .then(html => {
15 let ulList = []; 18 let ulList = [];
16 - const $ = cheerio.load(html.data); 19 +
17 - const $bodyList = $("div.news_list ul").children("li"); 20 + const $ = cheerio.load(iconv.convert(html.data).toString()); //iconv.decode(cheerio.load(html.data), "EUC-KR").toString(); encoding이 EUC-KR로 되어있음
21 + const $bodyList = $("div.hotIssueCluster.timeline>div.cluster_box").children("div.cluster_basic");
18 22
19 $bodyList.each(function(i, elem) { 23 $bodyList.each(function(i, elem) {
20 ulList[i] = { 24 ulList[i] = {
21 - url: $(this).find('a').attr('href'), 25 + datetime: $(this).find('div.cluster_basic>div.mduCluster>div.mduWrap>div.mduBasic>a>span.origin em.date').text(),
22 - image_url: $(this).find('a.thmb img').attr('src'), 26 + url: $(this).find('div.cluster_basic > div.mduCluster > div.mduWrap > div.mduBasic > a').attr('href'),
23 - title: $(this).find('div.text a').text(), 27 + image_url: $(this).find('div.cluster_basic > div.mduCluster > div.mduWrap > div.mduBasic > a > span.mduimgArea > img').attr('src'),
24 - summary: $(this).find('div.text p').text(),//.slice(0, -29) 28 + title: $(this).find('div.cluster_basic > div.mduCluster > div.mduWrap > div.mduBasic > a > span.tit').text(),
25 - datetime: $(this).find('div.text div.source span').text() 29 + summary: $(this).find('div.cluster_basic > div.mduCluster > div.mduWrap > div.mduBasic > a > span.text').text()//.slice(0, -29)
26 }; 30 };
27 - console.log(ulList[i]) // list object checking code 31 + //console.log(ulList[i]) // list object checking code
28 }); 32 });
29 33
30 - const data = ulList.filter(n => n.title); 34 + const data = ulList;
31 return data; 35 return data;
32 //return ulList; 36 //return ulList;
33 }).then(res => console.log(res)); 37 }).then(res => console.log(res));
...\ No newline at end of file ...\ No newline at end of file
......