Commit d80c569e by liuzhangyiding

Modify the corpus file loading code so that corpus entries can be stored together in a single file (one document per line)

parent 04a33d18
 吃米饭
-吃米饭
\ No newline at end of file
+吃米饭
+叫个外卖
+叫一份外卖
+吃杂酱面
+叫个外卖
+吃午饭
+点个外卖
+叫个外卖
+订个外卖
+需要一份外卖
+点个米饭
+叫个外卖
+点个外卖
\ No newline at end of file
--- 点个外卖
+++ /dev/null
-点个外卖
\ No newline at end of file
--- 吃杂酱面
+++ /dev/null
-吃杂酱面
\ No newline at end of file
--- 叫一份外卖
+++ /dev/null
-叫一份外卖
\ No newline at end of file
--- 叫个外卖
+++ /dev/null
-叫个外卖
\ No newline at end of file
--- 叫个外卖
+++ /dev/null
-叫个外卖
\ No newline at end of file
--- 点个米饭
+++ /dev/null
-点个米饭
\ No newline at end of file
--- 需要一份外卖
+++ /dev/null
-需要一份外卖
\ No newline at end of file
--- 订个外卖
+++ /dev/null
-订个外卖
\ No newline at end of file
--- 叫个外卖
+++ /dev/null
-叫个外卖
\ No newline at end of file
--- 点个外卖
+++ /dev/null
-点个外卖
\ No newline at end of file
--- 吃午饭
+++ /dev/null
-吃午饭
\ No newline at end of file
--- 叫个外卖
+++ /dev/null
-叫个外卖
\ No newline at end of file
 打车
-打车
\ No newline at end of file
+打车
+叫一辆车
+叫个出租
+打车去体育场
+叫辆车
+坐车
+叫辆出租车
+打一辆车
+叫个车
+需要一辆车
+打车去大雁塔
+叫个车
\ No newline at end of file
--- 叫一辆车
+++ /dev/null
-叫一辆车
\ No newline at end of file
--- 叫个车
+++ /dev/null
-叫个车
\ No newline at end of file
--- 打车去大雁塔
+++ /dev/null
-打车去大雁塔
\ No newline at end of file
--- 叫个出租
+++ /dev/null
-叫个出租
\ No newline at end of file
--- 打车去体育场
+++ /dev/null
-打车去体育场
\ No newline at end of file
--- 叫辆车
+++ /dev/null
-叫辆车
\ No newline at end of file
--- 坐车
+++ /dev/null
-坐车
\ No newline at end of file
--- 叫辆出租车
+++ /dev/null
-叫辆出租车
\ No newline at end of file
--- 打一辆车
+++ /dev/null
-打一辆车
\ No newline at end of file
--- 叫个车
+++ /dev/null
-叫个车
\ No newline at end of file
--- 需要一辆车
+++ /dev/null
-需要一辆车
\ No newline at end of file
@@ -11,15 +11,15 @@
(class-label fragment: each side of this hunk holds five 1.0 and ten 2.0 labels; two of the 1.0 entries sit at different positions after the change. The flattened side-by-side rendering does not preserve which exact lines were removed and which were added.)
@@ -3,29 +3,29 @@ kernel_type rbf
 gamma 0.07142857142857142
 nr_class 2
 total_sv 22
-rho 0.3543789360023631
+rho 0.3544510261690456
 label 1 2
 nr_sv 10 12
 SV
(22 support-vector lines follow on each side, each an alpha coefficient with sparse index:value features; retraining on the merged corpus reordered and rescaled them, and the flattened two-column rendering cannot be reliably split back into before and after columns.)
(A second libsvm-format data file, class labels 1 and 2 followed by sparse index:value feature vectors, was regenerated in the same way; its flattened two-column diff likewise cannot be reliably split into before and after columns line by line.)
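The model header above is standard libsvm: an RBF kernel with the stated gamma, two classes, 22 support vectors split 10/12 between them, and bias term rho. For a binary model, prediction reduces to the sign of f(x) = sum_i alpha_i * K(sv_i, x) - rho. A minimal sketch of that decision function, assuming sparse vectors are kept as index-to-value maps; all names here are illustrative, none of this code is part of the project:

import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

// Sketch of the libsvm binary decision function for an RBF model:
// f(x) = sum_i alpha_i * exp(-gamma * ||sv_i - x||^2) - rho.
// f(x) > 0 predicts the first label on the "label" line, otherwise the second.
final class RbfDecision {

    // Squared Euclidean distance between two sparse index->value vectors.
    static double squaredDistance(Map<Integer, Double> u, Map<Integer, Double> v) {
        Set<Integer> indices = new HashSet<Integer>(u.keySet());
        indices.addAll(v.keySet());
        double sum = 0.0;
        for (int i : indices) {
            double d = u.getOrDefault(i, 0.0) - v.getOrDefault(i, 0.0);
            sum += d * d;
        }
        return sum;
    }

    static double decisionValue(List<Double> alphas, List<Map<Integer, Double>> svs,
                                Map<Integer, Double> x, double gamma, double rho) {
        double f = -rho;
        for (int i = 0; i < alphas.size(); i++) {
            f += alphas.get(i) * Math.exp(-gamma * squaredDistance(svs.get(i), x));
        }
        return f;
    }
}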
@@ -21,48 +21,28 @@ public class IctclasAnalyzer extends AbstractDocumentAnalyzer implements DocumentAnalyzer {
     }
 
     @Override
-    public Map<String, Term> analyze(File file) {
-        String doc = file.getAbsolutePath();
-        LOG.info("Process document: file=" + doc);
+    public Map<String, Term> analyze(String text) {
         Map<String, Term> terms = new HashMap<String, Term>(0);
-        BufferedReader br = null;
-        try {
-            br = new BufferedReader(new InputStreamReader(new FileInputStream(file), charSet));
-            String line = null;
-            while((line = br.readLine()) != null) {
-                line = line.trim();
-                if(!line.isEmpty()) {
-                    List nlpSeg = WordUtil.getNlpSeg(line);
-                    StringBuffer content = new StringBuffer();
-                    for(Object rawWord : nlpSeg) {
-                        String[] words = String.valueOf(rawWord).split("/");
-                        if(words.length == 2) {
-                            String word = words[0];
-                            String lexicalCategory = words[1];
-                            Term term = terms.get(word);
-                            if(term == null) {
-                                term = new Term(word);
-                                // TODO set lexical category
-                                term.setLexicalCategory(lexicalCategory);
-                                terms.put(word, term);
-                            }
-                            term.incrFreq();
-                            LOG.debug("Got word: word=" + rawWord);
-                        }
-                    }
-                }
-            }
-        } catch (IOException e) {
-            e.printStackTrace();
-        } finally {
-            try {
-                if(br != null) {
-                    br.close();
-                }
-            } catch (IOException e) {
-                LOG.warn(e);
-            }
-        }
+        text = text.trim();
+        if(!text.isEmpty()) {
+            List nlpSeg = WordUtil.getNlpSeg(text);
+            StringBuffer content = new StringBuffer();
+            for(Object rawWord : nlpSeg) {
+                String[] words = String.valueOf(rawWord).split("/");
+                if(words.length == 2) {
+                    String word = words[0];
+                    String lexicalCategory = words[1];
+                    Term term = terms.get(word);
+                    if(term == null) {
+                        term = new Term(word);
+                        // TODO set lexical category
+                        term.setLexicalCategory(lexicalCategory);
+                        terms.put(word, term);
+                    }
+                    term.incrFreq();
+                    LOG.debug("Got word: word=" + rawWord);
+                }
+            }
+        }
         return terms;
     }
...
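With analyze(File) replaced by analyze(String), callers now pass one corpus line at a time instead of a file path. A hypothetical sketch of the new call pattern; the way the analyzer is obtained and the getFreq()/getLexicalCategory() accessors on Term are assumptions, not shown in this commit:

import java.util.Map;

// Hypothetical caller of the new String-based analyze(); how the analyzer
// is constructed and Term's getters are assumed, not taken from this commit.
final class AnalyzeDemo {
    static void printTerms(DocumentAnalyzer analyzer) {
        Map<String, Term> terms = analyzer.analyze("叫个外卖");
        for (Map.Entry<String, Term> entry : terms.entrySet()) {
            Term term = entry.getValue();
            System.out.println(entry.getKey()
                    + " [" + term.getLexicalCategory() + "] freq=" + term.getFreq());
        }
    }
}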
@@ -26,46 +26,31 @@ public class MMSeg4jAnalyzer extends AbstractDocumentAnalyzer implements DocumentAnalyzer {
     }
 
     @Override
-    public Map<String, Term> analyze(File file) {
-        String doc = file.getAbsolutePath();
-        LOG.info("Process document: file=" + doc);
+    public Map<String, Term> analyze(String text) {
         Map<String, Term> terms = new HashMap<String, Term>(0);
-        BufferedReader br = null;
         try {
-            br = new BufferedReader(
-                    new InputStreamReader(new FileInputStream(file), charSet));
-            String line = null;
-            while((line = br.readLine()) != null) {
-                StringReader reader = new StringReader(line);
-                TokenStream ts = analyzer.tokenStream("", reader);
-                ts.addAttribute(CharTermAttribute.class);
-                while (ts.incrementToken()) {
-                    CharTermAttributeImpl attr = (CharTermAttributeImpl) ts.getAttribute(CharTermAttribute.class);
-                    String word = attr.toString().trim();
-                    if(!word.isEmpty() && !super.isStopword(word)) {
-                        Term term = terms.get(word);
-                        if(term == null) {
-                            term = new Term(word);
-                            terms.put(word, term);
-                        }
-                        term.incrFreq();
-                    } else {
-                        LOG.debug("Filter out stop word: file=" + file + ", word=" + word);
-                    }
-                }
-                ts.close();
-                reader.close();
-            }
+            StringReader reader = new StringReader(text);
+            TokenStream ts = analyzer.tokenStream("", reader);
+            ts.addAttribute(CharTermAttribute.class);
+            while (ts.incrementToken()) {
+                CharTermAttributeImpl attr = (CharTermAttributeImpl) ts.getAttribute(CharTermAttribute.class);
+                String word = attr.toString().trim();
+                if(!word.isEmpty() && !super.isStopword(word)) {
+                    Term term = terms.get(word);
+                    if(term == null) {
+                        term = new Term(word);
+                        terms.put(word, term);
+                    }
+                    term.incrFreq();
+                } else {
+                    LOG.debug("Filter out stop word: word=" + word);
+                }
+            }
+            ts.close();
+            reader.close();
         } catch (IOException e) {
             e.printStackTrace();
         } finally {
-            try {
-                if(br != null) {
-                    br.close();
-                }
-            } catch (IOException e) {
-                LOG.warn(e);
-            }
-            LOG.info("Done: file=" + file + ", termCount=" + terms.size());
+            LOG.info("termCount=" + terms.size());
         }
         return terms;
     }
...
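One caveat with consuming a Lucene TokenStream directly: from Lucene 4.x onward the stream must be reset() before the first incrementToken() and end()-ed afterwards, or incrementToken() throws IllegalStateException. Whether that affects the loop above depends on the mmseg4j and Lucene versions this project pins. A standalone sketch of the version-safe consumption loop, not this project's code:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Version-safe Lucene token consumption: reset() before the loop,
// end() after it, close() always. Sketch only, assuming a Lucene 4.x+ Analyzer.
final class TokenLoop {
    static void forEachToken(Analyzer analyzer, String text) throws IOException {
        TokenStream ts = analyzer.tokenStream("", new StringReader(text));
        CharTermAttribute termAttr = ts.addAttribute(CharTermAttribute.class);
        try {
            ts.reset();                  // mandatory in Lucene 4.x+
            while (ts.incrementToken()) {
                System.out.println(termAttr.toString());
            }
            ts.end();                    // finalize offset/attribute state
        } finally {
            ts.close();                  // release analyzer resources
        }
    }
}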
@@ -5,5 +5,5 @@ import java.util.Map;
 
 public interface DocumentAnalyzer {
-    Map<String, Term> analyze(File file);
+    Map<String, Term> analyze(String text);
 }
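This one-line signature change is what drives both analyzer rewrites above: implementations now receive raw text rather than a File. For reference, a toy implementation of the new contract, with whitespace splitting standing in for real Chinese segmentation; the real analyzers in this project delegate to ICTCLAS or mmseg4j instead:

import java.util.HashMap;
import java.util.Map;

// Toy implementation of the String-based contract; illustrative only.
public class WhitespaceDocumentAnalyzer implements DocumentAnalyzer {
    @Override
    public Map<String, Term> analyze(String text) {
        Map<String, Term> terms = new HashMap<String, Term>();
        for (String word : text.trim().split("\\s+")) {
            if (word.isEmpty()) {
                continue;
            }
            Term term = terms.get(word);
            if (term == null) {
                term = new Term(word);
                terms.put(word, term);
            }
            term.incrFreq();   // accumulate per-document term frequency
        }
        return terms;
    }
}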
@@ -5,8 +5,7 @@ import org.apache.commons.logging.LogFactory;
 import org.shirdrn.document.processor.common.AbstractComponent;
 import org.shirdrn.document.processor.common.Context;
-import java.io.File;
-import java.io.FileFilter;
+import java.io.*;
 
 public class BasicInformationCollector extends AbstractComponent {
@@ -34,9 +33,31 @@ public class BasicInformationCollector extends AbstractComponent {
                 return pathname.getAbsolutePath().endsWith(context.getFDMetadata().getFileExtensionName());
             }
         });
-        context.getVectorMetadata().putLabelledTotalDocCount(label, files.length);
-        LOG.info("Put document count: label=" + label + ", docCount=" + files.length);
-        totalDocCount += files.length;
+        // one document per corpus line: count lines in every file under this label
+        int totalDocLabelCount = 0;
+        for(File file : files) {
+            LineNumberReader reader = null;
+            FileReader in = null;
+            try {
+                in = new FileReader(file);
+                reader = new LineNumberReader(in);
+                // skip to EOF so getLineNumber() yields the file's line count
+                reader.skip(Long.MAX_VALUE);
+                totalDocLabelCount += reader.getLineNumber();
+            } catch (IOException e) {
+                e.printStackTrace();
+            } finally {
+                try {
+                    if(reader != null) {
+                        reader.close();
+                    }
+                    if(in != null) {
+                        in.close();
+                    }
+                } catch (IOException e) {
+                    LOG.warn(e);
+                }
+            }
+        }
+        totalDocCount += totalDocLabelCount;
+        context.getVectorMetadata().putLabelledTotalDocCount(label, totalDocLabelCount);
+        LOG.info("Put document count: label=" + label + ", docCount=" + totalDocLabelCount);
     }
     LOG.info("Total documents: totalCount=" + totalDocCount);
     context.getVectorMetadata().setTotalDocCount(totalDocCount);
...
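The per-label document count is now "total lines across the label's corpus files" rather than "number of files". One subtlety: LineNumberReader.getLineNumber() counts line terminators, so a final line without a trailing newline, which is exactly how the corpus files in this commit end, is not counted. A sketch of a counter that avoids that edge case, assuming Java 7+ try-with-resources; this is not the code in the commit:

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;

// Counts one document per non-empty line; readLine() also returns a final
// line that lacks a trailing '\n'. Sketch only.
final class LineCounter {
    static int countDocuments(File corpusFile) throws IOException {
        int count = 0;
        try (BufferedReader reader = new BufferedReader(new FileReader(corpusFile))) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (!line.trim().isEmpty()) {
                    count++;
                }
            }
        }
        return count;
    }
}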
@@ -2,11 +2,13 @@ package org.shirdrn.document.processor.component;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl;
 import org.shirdrn.document.processor.common.*;
 import org.shirdrn.document.processor.utils.ReflectionUtils;
-import java.io.File;
-import java.io.FileFilter;
+import java.io.*;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.Map;
@@ -54,29 +56,46 @@ public class DocumentWordsCollector extends AbstractComponent {
                 }
             });
             LOG.info("Prepare to analyze " + files.length + " files.");
-            int n = 0;
+            int n = 1;
             for(File file : files) {
-                analyze(label, file);
-                ++n;
+                String doc = file.getAbsolutePath();
+                // iterate over every line of the corpus file: each line is one document
+                BufferedReader br = null;
+                try {
+                    br = new BufferedReader(
+                            new InputStreamReader(new FileInputStream(file), charSet));
+                    String line = null;
+                    while((line = br.readLine()) != null) {
+                        String tempdoc = doc + "-" + n;
+                        analyze(label, tempdoc, line);
+                        n++;
+                    }
+                } catch (IOException e) {
+                    e.printStackTrace();
+                } finally {
+                    try {
+                        if(br != null) {
+                            br.close();
+                        }
+                    } catch (IOException e) {
+                        LOG.warn(e);
+                    }
+                }
             }
-            LOG.info("Analyzed files: count=" + n);
         }
         // output statistics
         stat();
     }
 
-    protected void analyze(String label, File file) {
-        String doc = file.getAbsolutePath();
-        LOG.debug("Process document: label=" + label + ", file=" + doc);
-        Map<String, Term> terms = analyzer.analyze(file);
+    protected void analyze(String label, String doc, String text) {
+        LOG.debug("Process document: label=" + label + ", text=" + text);
+        Map<String, Term> terms = analyzer.analyze(text);
         // filter terms
         filterTerms(terms);
         // construct memory structure
         context.getVectorMetadata().addTerms(label, doc, terms);
         // add inverted table as needed
         context.getVectorMetadata().addTermsToInvertedTable(label, doc, terms);
-        LOG.debug("Done: file=" + file + ", termCount=" + terms.size());
-        LOG.debug("Terms in a doc: terms=" + terms);
     }
 
     protected void filterTerms(Map<String, Term> terms) {
...
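Since one file now yields many documents, each line gets a synthetic document id: the file's absolute path plus a running counter. The counter keeps increasing across all files of a label, so ids stay unique even when several corpus files share a name. The convention in isolation, as an illustrative helper that is not part of the commit:

import java.io.File;

// Illustrative helper mirroring the commit's id convention: one id per line,
// e.g. file "/corpus/打车" plus counter 3 yields "/corpus/打车-3".
final class DocIds {
    static String lineDocumentId(File file, int lineCounter) {
        return file.getAbsolutePath() + "-" + lineCounter;
    }
}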