import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;

import kr.co.shineware.nlp.komoran.core.analyzer.Komoran;
import kr.co.shineware.util.common.model.Pair;
/**
 * Example program that downloads an RSS feed, strips its markup, splits the remaining text into
 * tokens, and prints each distinct token with its occurrence count in ascending order of count.
 *
 * <p>Usage: {@code RssAnalyticsExample [modelPath [feedUrl]]}. Both arguments are optional and
 * fall back to the built-in defaults.
 */
public class RssAnalyticsExample {

    /** Komoran model directory used when no first command-line argument is given. */
    private static final String DEFAULT_MODEL_PATH = "D:/Projects/komoran/models-full";

    /** Feed address used when no second command-line argument is given. */
    private static final String DEFAULT_FEED_URL = "http://rss.hankooki.com/daily/dh_main.xml";

    /** Character encoding used to decode the feed bytes. */
    private static final String FEED_CHARSET = "EUC-KR";

    /** Characters that separate tokens in the stripped feed text. */
    private static final String TOKEN_DELIMITERS = " \t\n\r,.'\"-=%…()[]{}“▷+ⓒ";

    // NOTE(review): "NNP" is presumably Komoran's proper-noun tag — confirm against its tag set.
    private static final String NNP_TAG = "NNP";

    /**
     * Downloads the feed, counts its tokens, and prints the counts to standard output.
     *
     * @param args optional overrides: {@code args[0]} is the Komoran model directory and
     *     {@code args[1]} is the feed URL
     */
    public static void main(String[] args) {
        String modelPath = (args != null && args.length > 0) ? args[0] : DEFAULT_MODEL_PATH;
        String feedUrl = (args != null && args.length > 1) ? args[1] : DEFAULT_FEED_URL;

        Komoran komoran = new Komoran(modelPath);
        try {
            String text = stripMarkup(download(feedUrl));

            // Insertion-ordered map: constant-time lookup per token, and the stable sort below
            // keeps equally frequent words in first-seen order.
            Map<String, Word> counts = new LinkedHashMap<>();
            StringTokenizer tokens = new StringTokenizer(text, TOKEN_DELIMITERS);
            while (tokens.hasMoreTokens()) {
                String token = normalize(komoran, tokens.nextToken());
                Word seen = counts.get(token);
                if (seen == null) {
                    counts.put(token, new Word(token));
                } else {
                    seen.setCount(seen.getCount() + 1);
                }
            }

            List<Word> wordList = new ArrayList<>(counts.values());
            Collections.sort(wordList);
            for (Word word : wordList) {
                System.out.println(word);
            }
        } catch (MalformedURLException e) {
            // User-facing message: "Invalid address."
            System.out.println("잘못된 주소입니다.");
            System.err.println(e);
        } catch (IOException e) {
            // User-facing message: "Cannot open the input stream."
            System.out.println("입력 스트림을 열 수 없습니다.");
            System.err.println(e);
        }
    }

    /**
     * Reads the whole document at {@code feedUrl} into memory, decoding it as
     * {@link #FEED_CHARSET}. The stream and reader are always closed.
     *
     * @param feedUrl address of the feed to fetch
     * @return the complete decoded feed text
     * @throws MalformedURLException if {@code feedUrl} is not a valid URL
     * @throws IOException if the stream cannot be opened or read
     */
    private static String download(String feedUrl) throws IOException {
        URL url = new URL(feedUrl);
        StringBuilder content = new StringBuilder();
        try (InputStream is = url.openStream();
                BufferedReader br = new BufferedReader(new InputStreamReader(is, FEED_CHARSET))) {
            char[] buffer = new char[1000];
            int readCount;
            while ((readCount = br.read(buffer)) != -1) {
                content.append(buffer, 0, readCount);
            }
        }
        return content.toString();
    }

    /**
     * Removes publication dates, CDATA markers, and XML/HTML tags from the feed text.
     *
     * <p>Must run on the complete document: applying it to fixed-size chunks breaks any tag that
     * straddles a chunk boundary.
     *
     * @param xml raw feed text
     * @return the text with markup removed
     */
    private static String stripMarkup(String xml) {
        // Reluctant match so two dates on one line do not swallow the text between them.
        String text = xml.replaceAll("<pubDate>.*?</pubDate>", "");
        // Literal replacement: only real CDATA markers are removed.
        text = text.replace("<![CDATA[", "");
        text = text.replace("]]>", "");
        return text.replaceAll("<(/)?([a-zA-Z]*)(\\s[a-zA-Z]*=[^>]*)?(\\s)*(/)?>", "");
    }

    /**
     * Returns the form of {@code token} to count: the first analysed morpheme when its tag is
     * {@link #NNP_TAG}, otherwise the token unchanged.
     *
     * @param komoran analyser used to tag the token
     * @param token raw token from the feed text
     * @return the text under which the token is counted
     */
    private static String normalize(Komoran komoran, String token) {
        List<List<Pair<String, String>>> result = komoran.analyze(token);
        if (result == null || result.isEmpty()) {
            return token; // nothing analysed: count the raw token
        }
        List<Pair<String, String>> pairs = result.get(0);
        if (pairs == null || pairs.isEmpty()) {
            return token;
        }
        Pair<String, String> pair = pairs.get(0);
        return NNP_TAG.equals(pair.getSecond()) ? pair.getFirst() : token;
    }
}
class
Word
implements
Comparable<Word> {
private
String word;
private
int
count;
public
Word(String word) {
super
();
this
.word = word;
this
.count =
1
;
}
public
String getWord() {
return
word;
}
public
void
setWord(String word) {
this
.word = word;
}
public
int
getCount() {
return
count;
}
public
void
setCount(
int
count) {
this
.count = count;
}
@Override
public
int
hashCode() {
final
int
prime =
31
;
int
result =
1
;
result = prime * result + ((word ==
null
) ?
0
: word.hashCode());
return
result;
}
@Override
public
boolean
equals(Object obj) {
if
(
this
== obj)
return
true
;
if
(obj ==
null
)
return
false
;
if
(getClass() != obj.getClass())
return
false
;
Word other = (Word) obj;
if
(word ==
null
) {
if
(other.word !=
null
)
return
false
;
}
else
if
(!word.equals(other.word))
return
false
;
return
true
;
}
@Override
public
String toString() {
return
"Word [word="
+ word +
", count="
+ count +
"]"
;
}
@Override
public
int
compareTo(Word o) {
return
this
.count - o.getCount();
}
}