Showing 8 changed files with 0 additions and 360 deletions
src/lib/util.py deleted 100644 → 0
-from nltk.tokenize import word_tokenize, sent_tokenize
-import nltk
-import re
-from bs4 import BeautifulSoup
-from newspaper import Article
-import requests
-
-
-def get_HTML_from_url(url):
-    return requests.get(url).text
-
-
-def get_text_from_HTML(html):
-    soup = BeautifulSoup(html, 'html.parser')
-    script_tags = soup.find_all(['script', 'style', 'header', 'footer', 'form'])
-
-    for script in script_tags:
-        script.extract()
-    content = soup.get_text('\n', strip=True)
-    return content
-
-
-# def get_HTML_from_regexp_url(url_pattern):
-
-
-def is_string(target):
-    return isinstance(target, str)
-
-
-def cut_corpus(corpus):
-    if not is_string(corpus):
-        return []
-    return corpus.split('.')[:-1]
-
-
-def postag_sentence(sentence):
-    if not is_string(sentence):
-        return []
-    tokens = word_tokenize(sentence)
-    return nltk.pos_tag(tokens)
-
-
-# Returns the index of the first verb tag.
-# Returns -1 if no verb is found.
-def find_verb_idx(tags):
-    for idx, (_, pos) in enumerate(tags):
-        if pos.startswith('V'):
-            return idx
-    return -1
-
-
-def make_be_verb(subj):
-    if subj == 'I':
-        return 'am'
-    elif subj in ['You', 'you']:
-        return 'are'
-    else:
-        return 'is'
-
-
-def cut_quot(sentence):
-    return re.sub(r"['\"`]", '', sentence)
-
-
-# Known failure cases:
-# 1. the brace is never closed
-# 2. target_str is empty
-def make_brace_triple(target_str, brace_tags):
-    if target_str == '':
-        return []
-    idx = find_verb_idx(brace_tags)
-    subj = target_str
-    if idx != -1:
-        pred = brace_tags[idx][0]
-        obj = ' '.join([value for value, _ in brace_tags[idx + 1:]])
-    else:
-        pred = make_be_verb(subj)
-        obj = ' '.join([value for value, _ in brace_tags])
-    return [subj, pred, obj]
-
-
-url = 'https://en.wikipedia.org/wiki/Korea'
-
-
-def get_bodytext_from_url(url):
-    news = Article(url, language='en')
-    news.download()
-    news.parse()
-    text = news.text
-    pattern = r'\[[^]]*\]'  # strip footnote markers such as [1]
-    text = re.sub(pattern=pattern, repl='', string=text)
-    return text
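For reference, the deleted helpers were meant to chain into a small URL-to-triple pipeline. The sketch below is a hypothetical usage example, not code from this repository: the import path and the choice of the first token as subject are assumptions, and it presumes NLTK's punkt and averaged_perceptron_tagger data are already downloaded.

# Hypothetical usage sketch for the deleted util.py helpers (not from this repo).
from util import get_bodytext_from_url, cut_corpus, postag_sentence, make_brace_triple

text = get_bodytext_from_url('https://en.wikipedia.org/wiki/Korea')
for sentence in cut_corpus(text):
    tags = postag_sentence(sentence)
    if len(tags) < 2:
        continue
    # Treat the first token as the subject and the remaining tagged tokens
    # as the "brace" content passed to make_brace_triple.
    subject, rest = tags[0][0], tags[1:]
    triple = make_brace_triple(subject, rest)
    if triple:
        print(triple)  # [subject, predicate, object]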
src/lib/web2rdf/README.md deleted 100644 → 0
This diff is collapsed.
No preview for this file type
No preview for this file type
-package test_package;
-import edu.stanford.nlp.coref.CorefCoreAnnotations;
-import edu.stanford.nlp.coref.data.CorefChain;
-import edu.stanford.nlp.coref.data.Dictionaries;
-import edu.stanford.nlp.coref.data.Mention;
-import edu.stanford.nlp.ie.util.RelationTriple;
-import edu.stanford.nlp.io.IOUtils;
-import edu.stanford.nlp.ling.CoreAnnotations;
-import edu.stanford.nlp.ling.CoreLabel;
-import edu.stanford.nlp.naturalli.NaturalLogicAnnotations;
-import edu.stanford.nlp.naturalli.OpenIE;
-import edu.stanford.nlp.naturalli.SentenceFragment;
-import edu.stanford.nlp.pipeline.Annotation;
-import edu.stanford.nlp.pipeline.CoreDocument;
-import edu.stanford.nlp.pipeline.CoreEntityMention;
-import edu.stanford.nlp.pipeline.StanfordCoreNLP;
-import edu.stanford.nlp.semgraph.SemanticGraph;
-import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
-import edu.stanford.nlp.tagger.maxent.MaxentTagger;
-import edu.stanford.nlp.util.CoreMap;
-import edu.stanford.nlp.util.IntPair;
-import edu.stanford.nlp.util.PropertiesUtils;
-import org.apache.jena.rdf.model.*;
-import org.jsoup.Jsoup;
-import org.jsoup.nodes.Document;
-import org.jsoup.select.Elements;
-import java.io.IOException;
-import java.util.*;
-import java.util.stream.Collectors;
-
-public class main {
-    public static void main(String[] args) throws IOException {
-        // Parse the article body with Jsoup
-        Document doc = Jsoup.connect("https://en.wikipedia.org/wiki/Korea").get();
-        Elements pTags = doc.getElementsByTag("p");
-        String bodyText = Jsoup.parse(pTags.toString()).text();
-
-//        // Tagging with the Stanford POS tagger
-//        MaxentTagger tagger = new MaxentTagger("taggers/english-left3words-distsim.tagger");
-//        String tagged = tagger.tagString(bodyText);
-//
-//        // Extract the most frequent proper noun as the core keyword
-//        String[] taggedArr = tagged.split(" ");
-//        List<String> nnp = Arrays.stream(taggedArr).filter(word -> word.contains("_NNP")).collect(Collectors.toList());
-//        Hashtable<String,Integer> freqOfWordTable = new Hashtable<>();
-//        for (String word : nnp) {
-//            Integer freq = freqOfWordTable.get(word); // word is the key, freq is its count
-//            freqOfWordTable.put(word, (freq == null) ? 1 : freq + 1);
-//        }
-//        List sortedList = sortByValue(freqOfWordTable);
-//        String coreNoun = sortedList.get(0).toString();
-
-//        // Triple extraction
-//        String[] sentences = tagged.split("\\._\\.");
-//        List<String[]> triples = new ArrayList<>();
-//        for (String sentence : sentences) {
-//            if (sentence.contains(coreNoun)) {
-//                String[] words = sentence.split(" ");
-//                String subject = "";
-//                String predicate = "";
-//                String object = "";
-//                for (String word : words) {
-//                    if (word.equals(coreNoun)) {
-//                        String[] removeTag = word.split("_");
-//                        subject = removeTag[0];
-//                    } else if (word.contains("_VB") && !subject.isEmpty()) {
-//                        String[] removeTag = word.split("_");
-//                        predicate = removeTag[0];
-//                    } else if (word.contains("_NNP") && !predicate.isEmpty()) {
-//                        String[] removeTag = word.split("_");
-//                        object = removeTag[0];
-//                    }
-//                    if (!subject.isEmpty() && !predicate.isEmpty() && !object.isEmpty()) {
-//                        String[] triple = {subject, predicate, object};
-//                        triples.add(triple);
-//                    }
-//                }
-//            }
-//        }
-
-//        // Build an RDF model with Jena
-//        Model model = ModelFactory.createDefaultModel();
-//        for (String[] statement : triples) {
-//            Resource s = model.createResource("http://subject/" + statement[0]);
-//            Property p = model.createProperty("http://predicate/" + statement[1]);
-//            RDFNode o = model.createLiteral(statement[2]);
-//
-//            if (s.hasProperty(p)) {
-//                s.addProperty(p, model.createResource().addProperty(p, o));
-//            } else {
-//                s.addProperty(p, o);
-//            }
-//        }
-//        model.write(System.out);
-//        //RDFDataMgr.write(System.out, model, Lang.NTRIPLES); // print in N-TRIPLES format
-
-        // Stanford CoreNLP: first pass runs a pipeline with coreference resolution
-        Properties props = PropertiesUtils.asProperties(
-                "annotators", "tokenize,ssplit,pos,lemma,ner,parse,coref"
-        );
-        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
-
-        String text;
-        if (args.length > 0) {
-            text = IOUtils.slurpFile(args[0]);
-        } else {
-            text = "Korea (officially the \"Korean Peninsula\") is a region in East Asia. Since 1945 it has been divided into the two parts which soon became the two sovereign states: North Korea (officially the \"Democratic People's Republic of Korea\") and South Korea (officially the \"Republic of Korea\"). Korea consists of the Korean Peninsula, Jeju Island, and several minor islands near the peninsula. It is bordered by China to the northwest and Russia to the northeast. It is separated from Japan to the east by the Korea Strait and the Sea of Japan (East Sea). During the first half of the 1st millennium, Korea was divided between the three competing states of Goguryeo, Baekje, and Silla, together known as the Three Kingdoms of Korea.";
-            //text = bodyText;
-        }
-
-
-        Annotation docu = new Annotation(text);
-        pipeline.annotate(docu);
-        List<String> sentList = new ArrayList<>();
-        for (CoreMap sentence : docu.get(CoreAnnotations.SentencesAnnotation.class)) {
-            sentList.add(sentence.get(CoreAnnotations.TextAnnotation.class));
-        }
-
-
-        // Replace every coreferent mention with the first mention of its chain
-        String newText = "";
-        Collection<CorefChain> values = docu.get(CorefCoreAnnotations.CorefChainAnnotation.class).values();
-        for (CorefChain cc : values) {
-            //System.out.println("\t" + cc.getMentionsInTextualOrder());
-            List<CorefChain.CorefMention> mentionsInTextualOrder = cc.getMentionsInTextualOrder();
-            String coreWord = "";
-            for (int i = 0; i < mentionsInTextualOrder.size(); i++) {
-                if (i == 0) {
-                    coreWord = mentionsInTextualOrder.get(i).mentionSpan; // keep the first mention as the canonical noun
-                }
-                String mention = mentionsInTextualOrder.get(i).mentionSpan; // mention to replace (e.g. a pronoun)
-                int sentNum = mentionsInTextualOrder.get(i).sentNum - 1; // 0-based sentence index
-                String modiSent = sentList.get(sentNum); // sentence to rewrite
-                modiSent = modiSent.replace(mention, coreWord); // literal replace of the mention with the canonical noun (replaceAll would treat it as a regex)
-                sentList.set(sentNum, modiSent); // store the rewritten sentence
-            }
-        }
-
-        //System.out.println(sentList);
-
-        for (String s : sentList) {
-            newText += s + " ";
-        }
-        System.out.println(text);
-        System.out.println("--------------------------------------------");
-        System.out.println(newText);
-
-        System.out.println("\n \n");
-
-
-        // Second pass: OpenIE over the coreference-resolved text
-        props = PropertiesUtils.asProperties(
-                "annotators", "tokenize,ssplit,pos,lemma,parse,natlog,openie"
-        );
-        props.setProperty("openie.max_entailments_per_clause", "100");
-        props.setProperty("openie.triple.strict", "false");
-        pipeline = new StanfordCoreNLP(props);
-
-        docu = new Annotation(newText);
-        pipeline.annotate(docu);
-        int sentNo = 0;
-        for (CoreMap sentence : docu.get(CoreAnnotations.SentencesAnnotation.class)) {
-            System.out.println("Sentence #" + ++sentNo + ": " + sentence.get(CoreAnnotations.TextAnnotation.class));
-
-//            // Print SemanticGraph
-//            System.out.println(sentence.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class).toString(SemanticGraph.OutputFormat.LIST));
-
-            // Get the OpenIE triples for the sentence
-            Collection<RelationTriple> triples = sentence.get(NaturalLogicAnnotations.RelationTriplesAnnotation.class);
-
-            // Print the triples
-            for (RelationTriple triple : triples) {
-                System.out.println(triple.confidence + "\t" +
-                        "<" + triple.subjectGloss() + ">" + "\t" +
-                        "<" + triple.relationGloss() + ">" + "\t" +
-                        "<" + triple.objectGloss() + ">");
-            }
-            System.out.println("\n");
-            // Alternately, to only run e.g. the clause splitter:
-//            List<SentenceFragment> clauses = new OpenIE(props).clausesInSentence(sentence);
-//            for (SentenceFragment clause : clauses) {
-//                System.out.println(clause.parseTree.toString(SemanticGraph.OutputFormat.LIST));
-//            }
-//            System.out.println();
-        }
-
-    }
-    // Sorts the map's keys by their values in descending order.
-    public static List sortByValue(final Map map) {
-        List<String> list = new ArrayList();
-        list.addAll(map.keySet());
-        Collections.sort(list, new Comparator() {
-            public int compare(Object o1, Object o2) {
-                Object v1 = map.get(o1);
-                Object v2 = map.get(o2);
-                return ((Comparable) v2).compareTo(v1);
-            }
-        });
-        //Collections.reverse(list); // uncomment for ascending order
-        return list;
-    }
-}
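The deleted Java program runs coreference resolution first and then extracts OpenIE triples. A roughly equivalent extraction step can also be driven from Python; the sketch below is a hypothetical example using stanza's CoreNLPClient wrapper (an assumption, not code from this repository), and it presumes `pip install stanza` plus a local Stanford CoreNLP distribution pointed to by the CORENLP_HOME environment variable.

# Hypothetical sketch: OpenIE triples via stanza's CoreNLPClient (not from this repo).
from stanza.server import CoreNLPClient

text = "Korea is a region in East Asia. Korea is bordered by China to the northwest."

with CoreNLPClient(annotators=['tokenize', 'ssplit', 'pos', 'lemma',
                               'parse', 'natlog', 'openie'],
                   timeout=60000, memory='5G', be_quiet=True) as client:
    ann = client.annotate(text)
    for sentence in ann.sentence:
        for triple in sentence.openieTriple:
            # Mirrors the Java output: confidence <subject> <relation> <object>
            print(triple.confidence, triple.subject, triple.relation, triple.object, sep='\t')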
This file is too large to display.
-model = /u/nlp/data/pos-tagger/models-4.0.0/models/english-left3words-distsim-prod1.tagger
-arch = left3words,wordshapes(-1,1),distsim(/u/nlp/data/pos_tags_are_useless/egw4-reut.512.clusters,-1,1),distsimconjunction(/u/nlp/data/pos_tags_are_useless/egw4-reut.512.clusters,-1,1),rareExtractor(edu.stanford.nlp.tagger.maxent.ExtractorUCase),rareExtractor(edu.stanford.nlp.tagger.maxent.ExtractorCNumber),rareExtractor(edu.stanford.nlp.tagger.maxent.ExtractorDash),rareExtractor(edu.stanford.nlp.tagger.maxent.ExtractorLetterDigitDash),rareExtractor(edu.stanford.nlp.tagger.maxent.CompanyNameDetector),rareExtractor(edu.stanford.nlp.tagger.maxent.ExtractorAllCapitalized),rareExtractor(edu.stanford.nlp.tagger.maxent.ExtractorUpperDigitDash),rareExtractor(edu.stanford.nlp.tagger.maxent.ExtractorStartSentenceCap),rareExtractor(edu.stanford.nlp.tagger.maxent.ExtractorMidSentenceCapC),rareExtractor(edu.stanford.nlp.tagger.maxent.ExtractorMidSentenceCap),prefix(10),suffix(10),unicodeshapes(0),rareExtractor(edu.stanford.nlp.tagger.maxent.ExtractorNonAlphanumeric)
-wordFunction = edu.stanford.nlp.process.AmericanizeFunction
-trainFile = /u/nlp/data/pos-tagger/models-4.0.0/data/experiments/english-left3words-distsim-prod1/train/craft-train.txt;/u/nlp/data/pos-tagger/models-4.0.0/data/experiments/english-left3words-distsim-prod1/train/ewt-train.txt;/u/nlp/data/pos-tagger/models-4.0.0/data/experiments/english-left3words-distsim-prod1/train/questionbank-train.txt;/u/nlp/data/pos-tagger/models-4.0.0/data/experiments/english-left3words-distsim-prod1/train/train-currency.txt;/u/nlp/data/pos-tagger/models-4.0.0/data/experiments/english-left3words-distsim-prod1/train/handparsed-train.txt;/u/nlp/data/pos-tagger/models-4.0.0/data/experiments/english-left3words-distsim-prod1/train/ontonotes-train.txt;/u/nlp/data/pos-tagger/models-4.0.0/data/experiments/english-left3words-distsim-prod1/train/wsj-train.txt;/u/nlp/data/pos-tagger/models-4.0.0/data/experiments/english-left3words-distsim-prod1/train/train-tech-english.txt
-closedClassTags =
-closedClassTagThreshold = 40
-curWordMinFeatureThresh = 2
-debug = false
-debugPrefix =
-tagSeparator = _
-encoding = UTF-8
-iterations = 100
-lang = english
-learnClosedClassTags = false
-minFeatureThresh = 2
-openClassTags =
-rareWordMinFeatureThresh = 5
-rareWordThresh = 5
-search = owlqn
-sgml = false
-sigmaSquared = 0.5
-regL1 = 0.75
-tagInside =
-tokenize = true
-tokenizerFactory =
-tokenizerOptions =
-verbose = false
-verboseResults = true
-veryCommonWordThresh = 250
-xmlInput =
-outputFile =
-outputFormat = slashTags
-outputFormatOptions =
-nthreads = 1
-minWordsLockTags = 1
src/lib/web2rdf/web2rdf.iml deleted 100644 → 0
-<?xml version="1.0" encoding="UTF-8"?>
-<module type="JAVA_MODULE" version="4">
-  <component name="NewModuleRootManager" inherit-compiler-output="true">
-    <exclude-output />
-    <content url="file://$MODULE_DIR$">
-      <sourceFolder url="file://$MODULE_DIR$/src" isTestSource="false" />
-    </content>
-    <orderEntry type="inheritedJdk" />
-    <orderEntry type="sourceFolder" forTests="false" />
-    <orderEntry type="module-library" exported="">
-      <library>
-        <CLASSES>
-          <root url="jar://$USER_HOME$/Desktop/stanford-postagger-full-2020-11-17/stanford-postagger-4.2.0.jar!/" />
-        </CLASSES>
-        <JAVADOC />
-        <SOURCES />
-      </library>
-    </orderEntry>
-    <orderEntry type="module-library" exported="">
-      <library>
-        <CLASSES>
-          <root url="jar://$USER_HOME$/Desktop/stanford-postagger-full-2020-11-17/jsoup-1.13.1.jar!/" />
-        </CLASSES>
-        <JAVADOC />
-        <SOURCES />
-      </library>
-    </orderEntry>
-    <orderEntry type="library" exported="" name="lib" level="project" />
-  </component>
-</module>
\ No newline at end of file