Commit d80c569e by liuzhangyiding

Modify the corpus file loading code so that all corpus entries for a label can live in a single file

parent 04a33d18
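
(Overview: the per-phrase corpus files below are deleted and their contents merged, one phrase per line, into a single corpus file per label. The loader changes further down read each combined file line by line and treat every line as one document, which is also why the trained model and label files are regenerated in this commit.)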
(new combined corpus file, meal-ordering phrases, one per line:)
吃米饭
吃米饭
吃米饭
叫个外卖
叫一份外卖
吃杂酱面
叫个外卖
吃午饭
点个外卖
叫个外卖
订个外卖
需要一份外卖
点个米饭
叫个外卖
点个外卖
\ No newline at end of file
(12 single-phrase corpus files deleted, each diffed against /dev/null; their contents are now carried by the combined file above: 叫个外卖 ×4, 点个外卖 ×2, 吃杂酱面, 叫一份外卖, 点个米饭, 需要一份外卖, 订个外卖, 吃午饭)
(new combined corpus file, taxi-hailing phrases, one per line:)
打车
打车
打车
叫一辆车
叫个出租
打车去体育场
叫辆车
坐车
叫辆出租车
打一辆车
叫个车
需要一辆车
打车去大雁塔
叫个车
\ No newline at end of file
(11 single-phrase corpus files deleted, each diffed against /dev/null; their contents are now carried by the combined file above: 叫个车 ×2, 叫一辆车, 打车去大雁塔, 叫个出租, 打车去体育场, 叫辆车, 坐车, 叫辆出租车, 打一辆车, 需要一辆车)
@@ -11,15 +11,15 @@
(these appear to be per-document class labels, 1.0 or 2.0, from a training file; the diff's +/- markers were lost in extraction, so old and new values are interleaved below)
1.0
1.0
1.0
1.0
2.0
1.0
2.0
1.0
2.0
2.0
2.0
2.0
2.0
2.0
1.0
2.0
2.0
@@ -3,29 +3,29 @@ kernel_type rbf
(SVM model file, apparently regenerated after retraining on the merged corpus; the diff's +/- markers were lost in extraction, so old and new lines are interleaved below)
gamma 0.07142857142857142
nr_class 2
total_sv 22
rho 0.3543789360023631
rho 0.3544510261690456
label 1 2
nr_sv 10 12
SV
1.0 10:3.6438561897747253
1.0 9:3.0588936890535687
1.0 8:1.5294468445267844 3:0.6609640474436812
1.0 13:1.8219280948873626 9:1.5294468445267844
0.9893662802090073 3:1.3219280948873624
0.9903141391672972 3:1.3219280948873624
1.0 3:1.3219280948873624
1.0 5:3.6438561897747253
1.0 3:1.3219280948873624
1.0 9:3.0588936890535687
1.0 3:1.3219280948873624
1.0 10:3.6438561897747253
1.0 8:1.019631229684523 7:1.019631229684523 3:0.44064269829578745
-1.0
1.0 13:1.8219280948873626 9:1.5294468445267844
1.0 5:3.6438561897747253
-0.6700922884685501 14:2.643856189774725
-0.2563835238113437 11:3.0588936890535687
-1.0
-1.0 14:1.3219280948873624 12:1.8219280948873626
-1.0 1:1.8219280948873626 14:1.3219280948873624
-0.07411630357738502 4:2.643856189774725
-0.6694907707158689 14:2.643856189774725
-1.0 6:3.6438561897747253
-0.9972716531392096 4:2.643856189774725
-1.0 2:3.6438561897747253
-0.9921040289652 11:3.0588936890535687
-0.07237134131172171 4:2.643856189774725
-1.0 4:1.3219280948873624 7:1.5294468445267844
-1.0 1:1.8219280948873626 14:1.3219280948873624
-0.9914669855756817 11:3.0588936890535687
-1.0 14:1.3219280948873624 12:1.8219280948873626
-1.0
-1.0 4:2.643856189774725
-1.0 6:3.6438561897747253
1 10:3.6438561897747253
1 9:3.0588936890535687
1 8:1.5294468445267844 3:0.6609640474436812
1 3:1.3219280948873624
1 3:1.3219280948873624
1 3:1.3219280948873624
1 8:1.5294468445267844 3:0.6609640474436812
1 13:1.8219280948873626 9:1.5294468445267844
1 3:1.3219280948873624
1 3:1.3219280948873624
1 5:3.6438561897747253
1 3:1.3219280948873624
1 9:3.0588936890535687
1 3:1.3219280948873624
1 10:3.6438561897747253
1 8:1.019631229684523 7:1.019631229684523 3:0.44064269829578745
2
1 13:1.8219280948873626 9:1.5294468445267844
1 5:3.6438561897747253
2 14:2.643856189774725
2 11:3.0588936890535687
2
2 14:1.3219280948873624 12:1.8219280948873626
2 2:3.6438561897747253
2 4:2.643856189774725
2 4:1.3219280948873624 7:1.5294468445267844
2 1:1.8219280948873626 14:1.3219280948873624
2 11:3.0588936890535687
2 14:1.3219280948873624 12:1.8219280948873626
2
2 4:2.643856189774725
2 14:2.643856189774725
2 6:3.6438561897747253
2 4:2.643856189774725
2 2:3.6438561897747253
2 11:3.0588936890535687
2 4:1.3219280948873624 7:1.5294468445267844
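
(For reference: this looks like a libsvm-format model file. rho is the offset in the binary decision function f(x) = sum_i coef_i * K(x_i, x) - rho, with the RBF kernel K(x_i, x) = exp(-gamma * ||x_i - x||^2); each SV line is a coefficient followed by sparse index:value features. Retraining on the merged corpus naturally shifts both the coefficients and rho slightly.)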
@@ -21,19 +21,11 @@ public class IctclasAnalyzer extends AbstractDocumentAnalyzer implements Documen
     }
     @Override
-    public Map<String, Term> analyze(File file) {
-        String doc = file.getAbsolutePath();
-        LOG.info("Process document: file=" + doc);
+    public Map<String, Term> analyze(String text) {
         Map<String, Term> terms = new HashMap<String, Term>(0);
-        BufferedReader br = null;
-        try {
-            br = new BufferedReader(new InputStreamReader(new FileInputStream(file), charSet));
-            String line = null;
-            while((line = br.readLine()) != null) {
-                line = line.trim();
-                if(!line.isEmpty()) {
-                    List nlpSeg = WordUtil.getNlpSeg(line);
+        text = text.trim();
+        if(!text.isEmpty()) {
+            List nlpSeg = WordUtil.getNlpSeg(text);
             StringBuffer content = new StringBuffer();
             for(Object rawWord : nlpSeg) {
                 String[] words = String.valueOf(rawWord).split("/");
@@ -52,18 +44,6 @@ public class IctclasAnalyzer extends AbstractDocumentAnalyzer implements Documen
                 }
             }
         }
-        }
-        } catch (IOException e) {
-            e.printStackTrace();
-        } finally {
-            try {
-                if(br != null) {
-                    br.close();
-                }
-            } catch (IOException e) {
-                LOG.warn(e);
-            }
-        }
         return terms;
     }
......
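
The split("/") in the analyzer above relies on ICTCLAS emitting each token as a word/POS pair. A minimal, self-contained sketch of that handling; the sample tokens are made up for illustration, not repository output:

public class NlpSegSketch {
    public static void main(String[] args) {
        // hypothetical ICTCLAS-style output: surface form, slash, POS tag
        String[] nlpSeg = { "点/v", "个/q", "外卖/n" };
        StringBuffer content = new StringBuffer();
        for (String rawWord : nlpSeg) {
            String[] words = rawWord.split("/");  // [word, POS tag]
            if (words.length == 2) {
                content.append(words[0]).append(" ");
            }
        }
        System.out.println(content.toString().trim());  // prints: 点 个 外卖
    }
}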
@@ -26,17 +26,10 @@ public class MMSeg4jAnalyzer extends AbstractDocumentAnalyzer implements Documen
     }
     @Override
-    public Map<String, Term> analyze(File file) {
-        String doc = file.getAbsolutePath();
-        LOG.info("Process document: file=" + doc);
+    public Map<String, Term> analyze(String text) {
         Map<String, Term> terms = new HashMap<String, Term>(0);
-        BufferedReader br = null;
-        try {
-            br = new BufferedReader(
-                    new InputStreamReader(new FileInputStream(file), charSet));
-            String line = null;
-            while((line = br.readLine()) != null) {
-                StringReader reader = new StringReader(line);
+        StringReader reader = new StringReader(text);
         TokenStream ts = analyzer.tokenStream("", reader);
         ts.addAttribute(CharTermAttribute.class);
         while (ts.incrementToken()) {
@@ -49,23 +42,15 @@ public class MMSeg4jAnalyzer extends AbstractDocumentAnalyzer implements Documen
             }
             term.incrFreq();
         } else {
-            LOG.debug("Filter out stop word: file=" + file + ", word=" + word);
+            LOG.debug("Filter out stop word: word=" + word);
         }
     }
     ts.close();
     reader.close();
-        }
-    } catch (IOException e) {
-        e.printStackTrace();
-    } finally {
-        try {
-            if(br != null) {
-                br.close();
-            }
-        } catch (IOException e) {
-            LOG.warn(e);
-        }
-        LOG.info("Done: file=" + file + ", termCount=" + terms.size());
+    LOG.info("termCount=" + terms.size());
     }
     return terms;
 }
......
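
The token loop in MMSeg4jAnalyzer follows the usual Lucene consumption pattern. One caution: on Lucene 4.x and later, reset() must be called before incrementToken(); the code above omits it, which only the older 3.x API tolerates. A hedged sketch of the full pattern, usable with any Analyzer:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenLoopSketch {
    // prints every token the given analyzer produces for the text
    static void printTokens(Analyzer analyzer, String text) throws IOException {
        TokenStream ts = analyzer.tokenStream("", new StringReader(text));
        CharTermAttribute termAttr = ts.addAttribute(CharTermAttribute.class);
        ts.reset();                      // required on Lucene 4.x and later
        while (ts.incrementToken()) {
            System.out.println(termAttr.toString());
        }
        ts.end();
        ts.close();
    }
}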
@@ -5,5 +5,5 @@ import java.util.Map;
 public interface DocumentAnalyzer {
-    Map<String, Term> analyze(File file);
+    Map<String, Term> analyze(String text);
 }
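
With the interface now taking a String, callers hand each corpus line to the analyzer directly. A self-contained sketch of the new contract; SimpleTerm and the whitespace analyzer below are hypothetical stand-ins for the repository's Term class and its ICTCLAS/MMSeg4j implementations:

import java.util.HashMap;
import java.util.Map;

public class AnalyzeTextSketch {
    interface DocumentAnalyzer { Map<String, SimpleTerm> analyze(String text); }

    static class SimpleTerm {
        final String word;
        int freq;
        SimpleTerm(String word) { this.word = word; }
        void incrFreq() { freq++; }
    }

    public static void main(String[] args) {
        // toy analyzer: split on whitespace and count term frequencies
        DocumentAnalyzer analyzer = text -> {
            Map<String, SimpleTerm> terms = new HashMap<String, SimpleTerm>();
            for (String w : text.trim().split("\\s+")) {
                terms.computeIfAbsent(w, SimpleTerm::new).incrFreq();
            }
            return terms;
        };
        // one corpus line is now one document
        System.out.println(analyzer.analyze("叫 个 外卖").keySet());
    }
}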
@@ -5,8 +5,7 @@ import org.apache.commons.logging.LogFactory;
 import org.shirdrn.document.processor.common.AbstractComponent;
 import org.shirdrn.document.processor.common.Context;
-import java.io.File;
-import java.io.FileFilter;
+import java.io.*;
 public class BasicInformationCollector extends AbstractComponent {
@@ -34,9 +33,31 @@ public class BasicInformationCollector extends AbstractComponent {
                 return pathname.getAbsolutePath().endsWith(context.getFDMetadata().getFileExtensionName());
             }
         });
-        context.getVectorMetadata().putLabelledTotalDocCount(label, files.length);
-        LOG.info("Put document count: label=" + label + ", docCount=" + files.length);
-        totalDocCount += files.length;
+        // each line of a corpus file is now one document, so count lines, not files
+        int totalDocLabelCount = 0;
+        for(File file : files) {
+            LineNumberReader reader = null;
+            try {
+                reader = new LineNumberReader(new FileReader(file));
+                reader.skip(Long.MAX_VALUE);
+                totalDocLabelCount += reader.getLineNumber();
+            } catch (IOException e) {
+                e.printStackTrace();
+            } finally {
+                try {
+                    if(reader != null) {
+                        reader.close();
+                    }
+                } catch (IOException e) {
+                    LOG.warn(e);
+                }
+            }
+        }
+        totalDocCount += totalDocLabelCount;
+        context.getVectorMetadata().putLabelledTotalDocCount(label, totalDocLabelCount);
+        LOG.info("Put document count: label=" + label + ", docCount=" + totalDocLabelCount);
     }
     LOG.info("Total documents: totalCount=" + totalDocCount);
     context.getVectorMetadata().setTotalDocCount(totalDocCount);
......
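
One caveat with the line counting above: LineNumberReader.getLineNumber() counts line terminators, so a file whose last line has no trailing newline (note the "\ No newline at end of file" markers in the corpus diff) comes up one short. A hedged alternative that counts lines by reading them; UTF-8 is an assumption here, mirror the project's charSet setting if it differs:

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;

public class LineCountSketch {
    // counts lines by reading them, so a final unterminated line still counts
    static int countLines(String path) throws IOException {
        BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(path), "UTF-8"));
        try {
            int count = 0;
            while (br.readLine() != null) {
                count++;
            }
            return count;
        } finally {
            br.close();
        }
    }

    public static void main(String[] args) throws IOException {
        System.out.println(countLines(args[0]));
    }
}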
@@ -2,11 +2,13 @@ package org.shirdrn.document.processor.component;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl;
 import org.shirdrn.document.processor.common.*;
 import org.shirdrn.document.processor.utils.ReflectionUtils;
-import java.io.File;
-import java.io.FileFilter;
+import java.io.*;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.Map;
@@ -54,29 +56,46 @@ public class DocumentWordsCollector extends AbstractComponent {
             }
         });
         LOG.info("Prepare to analyze " + files.length + " files.");
-        int n = 0;
+        int n = 1;
         for(File file : files) {
-            analyze(label, file);
-            ++n;
+            String doc = file.getAbsolutePath();
+            // walk every line of the file: each line is one document now
+            BufferedReader br = null;
+            try {
+                br = new BufferedReader(
+                        new InputStreamReader(new FileInputStream(file), charSet));
+                String line = null;
+                while((line = br.readLine()) != null) {
+                    String tempdoc = doc + "-" + n;
+                    analyze(label, tempdoc, line);
+                    n++;
+                }
+            } catch (IOException e) {
+                e.printStackTrace();
+            } finally {
+                try {
+                    if(br != null) {
+                        br.close();
+                    }
+                } catch (IOException e) {
+                    LOG.warn(e);
+                }
+            }
         }
-        LOG.info("Analyzed files: count=" + n);
+        LOG.info("Analyzed lines: count=" + (n - 1));
         // output statistics
         stat();
     }
-    protected void analyze(String label, File file) {
-        String doc = file.getAbsolutePath();
-        LOG.debug("Process document: label=" + label + ", file=" + doc);
-        Map<String, Term> terms = analyzer.analyze(file);
+    protected void analyze(String label, String doc, String text) {
+        LOG.debug("Process document: label=" + label + ", text=" + text);
+        Map<String, Term> terms = analyzer.analyze(text);
         // filter terms
         filterTerms(terms);
         // construct memory structure
         context.getVectorMetadata().addTerms(label, doc, terms);
         // add inverted table as needed
         context.getVectorMetadata().addTermsToInvertedTable(label, doc, terms);
-        LOG.debug("Done: file=" + file + ", termCount=" + terms.size());
+        LOG.debug("Terms in a doc: terms=" + terms);
     }
protected void filterTerms(Map<String, Term> terms) {
......
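
A final note on the collector loop: n starts at 1 and keeps incrementing across files, so the synthetic ids doc + "-" + n stay unique for a label even when several files contribute lines. A minimal sketch of the id scheme; the corpus path and lines are made up for illustration:

public class DocIdSketch {
    public static void main(String[] args) {
        String doc = "/corpus/train/waimai.txt";  // hypothetical path
        String[] lines = { "点个外卖", "叫一份外卖" };
        int n = 1;
        for (String line : lines) {
            String tempdoc = doc + "-" + n;  // e.g. /corpus/train/waimai.txt-1
            System.out.println(tempdoc + " -> " + line);
            n++;
        }
    }
}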