Commit d80c569e by liuzhangyiding

Modify the corpus file loading code so that corpus entries can be stored together in a single file (one document per line)

parent 04a33d18
 吃米饭
-吃米饭
\ No newline at end of file
+吃米饭
+叫个外卖
+叫一份外卖
+吃杂酱面
+叫个外卖
+吃午饭
+点个外卖
+叫个外卖
+订个外卖
+需要一份外卖
+点个米饭
+叫个外卖
+点个外卖
\ No newline at end of file
--- 点个外卖
+++ /dev/null
-点个外卖
\ No newline at end of file
--- 吃杂酱面
+++ /dev/null
-吃杂酱面
\ No newline at end of file
--- 叫一份外卖
+++ /dev/null
-叫一份外卖
\ No newline at end of file
--- 叫个外卖
+++ /dev/null
-叫个外卖
\ No newline at end of file
--- 叫个外卖
+++ /dev/null
-叫个外卖
\ No newline at end of file
--- 点个米饭
+++ /dev/null
-点个米饭
\ No newline at end of file
--- 需要一份外卖
+++ /dev/null
-需要一份外卖
\ No newline at end of file
--- 订个外卖
+++ /dev/null
-订个外卖
\ No newline at end of file
--- 叫个外卖
+++ /dev/null
-叫个外卖
\ No newline at end of file
--- 点个外卖
+++ /dev/null
-点个外卖
\ No newline at end of file
--- 吃午饭
+++ /dev/null
-吃午饭
\ No newline at end of file
--- 叫个外卖
+++ /dev/null
-叫个外卖
\ No newline at end of file
 打车
-打车
\ No newline at end of file
+打车
+叫一辆车
+叫个出租
+打车去体育场
+叫辆车
+坐车
+叫辆出租车
+打一辆车
+叫个车
+需要一辆车
+打车去大雁塔
+叫个车
\ No newline at end of file
--- 叫一辆车
+++ /dev/null
-叫一辆车
\ No newline at end of file
--- 叫个车
+++ /dev/null
-叫个车
\ No newline at end of file
--- 打车去大雁塔
+++ /dev/null
-打车去大雁塔
\ No newline at end of file
--- 叫个出租
+++ /dev/null
-叫个出租
\ No newline at end of file
--- 打车去体育场
+++ /dev/null
-打车去体育场
\ No newline at end of file
--- 叫辆车
+++ /dev/null
-叫辆车
\ No newline at end of file
--- 坐车
+++ /dev/null
-坐车
\ No newline at end of file
--- 叫辆出租车
+++ /dev/null
-叫辆出租车
\ No newline at end of file
--- 打一辆车
+++ /dev/null
-打一辆车
\ No newline at end of file
--- 叫个车
+++ /dev/null
-叫个车
\ No newline at end of file
--- 需要一辆车
+++ /dev/null
-需要一辆车
\ No newline at end of file
@@ -11,15 +11,15 @@
(class-label fragment: each side of this hunk holds five 1.0 and ten 2.0 labels; two of the 1.0 entries sit at different positions after the change. The flattened side-by-side rendering does not preserve which exact lines were removed and which were added.)
@@ -3,29 +3,29 @@ kernel_type rbf
 gamma 0.07142857142857142
 nr_class 2
 total_sv 22
-rho 0.3543789360023631
+rho 0.3544510261690456
 label 1 2
 nr_sv 10 12
 SV
(22 support-vector lines follow on each side, each an alpha coefficient with sparse index:value features; retraining on the merged corpus reordered and rescaled them, and the flattened two-column rendering cannot be reliably split back into before and after columns.)
(A second libsvm-format data file, class labels 1 and 2 followed by sparse index:value feature vectors, was regenerated in the same way; its flattened two-column diff likewise cannot be reliably split into before and after columns line by line.)
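The model header above is standard libsvm: an RBF kernel with the stated gamma, two classes, 22 support vectors split 10/12 between them, and bias term rho. For a binary model, prediction reduces to the sign of f(x) = sum_i alpha_i * K(sv_i, x) - rho. A minimal sketch of that decision function, assuming sparse vectors are kept as index-to-value maps; all names here are illustrative, none of this code is part of the project:

import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

// Sketch of the libsvm binary decision function for an RBF model:
// f(x) = sum_i alpha_i * exp(-gamma * ||sv_i - x||^2) - rho.
// f(x) > 0 predicts the first label on the "label" line, otherwise the second.
final class RbfDecision {

    // Squared Euclidean distance between two sparse index->value vectors.
    static double squaredDistance(Map<Integer, Double> u, Map<Integer, Double> v) {
        Set<Integer> indices = new HashSet<Integer>(u.keySet());
        indices.addAll(v.keySet());
        double sum = 0.0;
        for (int i : indices) {
            double d = u.getOrDefault(i, 0.0) - v.getOrDefault(i, 0.0);
            sum += d * d;
        }
        return sum;
    }

    static double decisionValue(List<Double> alphas, List<Map<Integer, Double>> svs,
                                Map<Integer, Double> x, double gamma, double rho) {
        double f = -rho;
        for (int i = 0; i < alphas.size(); i++) {
            f += alphas.get(i) * Math.exp(-gamma * squaredDistance(svs.get(i), x));
        }
        return f;
    }
}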
@@ -21,48 +21,28 @@ public class IctclasAnalyzer extends AbstractDocumentAnalyzer implements DocumentAnalyzer {
     }
 
     @Override
-    public Map<String, Term> analyze(File file) {
-        String doc = file.getAbsolutePath();
-        LOG.info("Process document: file=" + doc);
+    public Map<String, Term> analyze(String text) {
         Map<String, Term> terms = new HashMap<String, Term>(0);
-        BufferedReader br = null;
-        try {
-            br = new BufferedReader(new InputStreamReader(new FileInputStream(file), charSet));
-            String line = null;
-            while((line = br.readLine()) != null) {
-                line = line.trim();
-                if(!line.isEmpty()) {
-                    List nlpSeg = WordUtil.getNlpSeg(line);
-                    StringBuffer content = new StringBuffer();
-                    for(Object rawWord : nlpSeg) {
-                        String[] words = String.valueOf(rawWord).split("/");
-                        if(words.length == 2) {
-                            String word = words[0];
-                            String lexicalCategory = words[1];
-                            Term term = terms.get(word);
-                            if(term == null) {
-                                term = new Term(word);
-                                // TODO set lexical category
-                                term.setLexicalCategory(lexicalCategory);
-                                terms.put(word, term);
-                            }
-                            term.incrFreq();
-                            LOG.debug("Got word: word=" + rawWord);
-                        }
-                    }
-                }
-            }
-        } catch (IOException e) {
-            e.printStackTrace();
-        } finally {
-            try {
-                if(br != null) {
-                    br.close();
-                }
-            } catch (IOException e) {
-                LOG.warn(e);
-            }
-        }
+        text = text.trim();
+        if(!text.isEmpty()) {
+            List nlpSeg = WordUtil.getNlpSeg(text);
+            StringBuffer content = new StringBuffer();
+            for(Object rawWord : nlpSeg) {
+                String[] words = String.valueOf(rawWord).split("/");
+                if(words.length == 2) {
+                    String word = words[0];
+                    String lexicalCategory = words[1];
+                    Term term = terms.get(word);
+                    if(term == null) {
+                        term = new Term(word);
+                        // TODO set lexical category
+                        term.setLexicalCategory(lexicalCategory);
+                        terms.put(word, term);
+                    }
+                    term.incrFreq();
+                    LOG.debug("Got word: word=" + rawWord);
+                }
+            }
+        }
         return terms;
     }
...
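With analyze(File) replaced by analyze(String), callers now pass one corpus line at a time instead of a file path. A hypothetical sketch of the new call pattern; the way the analyzer is obtained and the getFreq()/getLexicalCategory() accessors on Term are assumptions, not shown in this commit:

import java.util.Map;

// Hypothetical caller of the new String-based analyze(); how the analyzer
// is constructed and Term's getters are assumed, not taken from this commit.
final class AnalyzeDemo {
    static void printTerms(DocumentAnalyzer analyzer) {
        Map<String, Term> terms = analyzer.analyze("叫个外卖");
        for (Map.Entry<String, Term> entry : terms.entrySet()) {
            Term term = entry.getValue();
            System.out.println(entry.getKey()
                    + " [" + term.getLexicalCategory() + "] freq=" + term.getFreq());
        }
    }
}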
@@ -26,46 +26,31 @@ public class MMSeg4jAnalyzer extends AbstractDocumentAnalyzer implements DocumentAnalyzer {
     }
 
     @Override
-    public Map<String, Term> analyze(File file) {
-        String doc = file.getAbsolutePath();
-        LOG.info("Process document: file=" + doc);
+    public Map<String, Term> analyze(String text) {
         Map<String, Term> terms = new HashMap<String, Term>(0);
-        BufferedReader br = null;
         try {
-            br = new BufferedReader(
-                    new InputStreamReader(new FileInputStream(file), charSet));
-            String line = null;
-            while((line = br.readLine()) != null) {
-                StringReader reader = new StringReader(line);
-                TokenStream ts = analyzer.tokenStream("", reader);
-                ts.addAttribute(CharTermAttribute.class);
-                while (ts.incrementToken()) {
-                    CharTermAttributeImpl attr = (CharTermAttributeImpl) ts.getAttribute(CharTermAttribute.class);
-                    String word = attr.toString().trim();
-                    if(!word.isEmpty() && !super.isStopword(word)) {
-                        Term term = terms.get(word);
-                        if(term == null) {
-                            term = new Term(word);
-                            terms.put(word, term);
-                        }
-                        term.incrFreq();
-                    } else {
-                        LOG.debug("Filter out stop word: file=" + file + ", word=" + word);
-                    }
-                }
-                ts.close();
-                reader.close();
-            }
+            StringReader reader = new StringReader(text);
+            TokenStream ts = analyzer.tokenStream("", reader);
+            ts.addAttribute(CharTermAttribute.class);
+            while (ts.incrementToken()) {
+                CharTermAttributeImpl attr = (CharTermAttributeImpl) ts.getAttribute(CharTermAttribute.class);
+                String word = attr.toString().trim();
+                if(!word.isEmpty() && !super.isStopword(word)) {
+                    Term term = terms.get(word);
+                    if(term == null) {
+                        term = new Term(word);
+                        terms.put(word, term);
+                    }
+                    term.incrFreq();
+                } else {
+                    LOG.debug("Filter out stop word: word=" + word);
+                }
+            }
+            ts.close();
+            reader.close();
         } catch (IOException e) {
             e.printStackTrace();
         } finally {
-            try {
-                if(br != null) {
-                    br.close();
-                }
-            } catch (IOException e) {
-                LOG.warn(e);
-            }
-            LOG.info("Done: file=" + file + ", termCount=" + terms.size());
+            LOG.info("termCount=" + terms.size());
         }
         return terms;
     }
...
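One caveat with consuming a Lucene TokenStream directly: from Lucene 4.x onward the stream must be reset() before the first incrementToken() and end()-ed afterwards, or incrementToken() throws IllegalStateException. Whether that affects the loop above depends on the mmseg4j and Lucene versions this project pins. A standalone sketch of the version-safe consumption loop, not this project's code:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Version-safe Lucene token consumption: reset() before the loop,
// end() after it, close() always. Sketch only, assuming a Lucene 4.x+ Analyzer.
final class TokenLoop {
    static void forEachToken(Analyzer analyzer, String text) throws IOException {
        TokenStream ts = analyzer.tokenStream("", new StringReader(text));
        CharTermAttribute termAttr = ts.addAttribute(CharTermAttribute.class);
        try {
            ts.reset();                  // mandatory in Lucene 4.x+
            while (ts.incrementToken()) {
                System.out.println(termAttr.toString());
            }
            ts.end();                    // finalize offset/attribute state
        } finally {
            ts.close();                  // release analyzer resources
        }
    }
}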
@@ -5,5 +5,5 @@ import java.util.Map;
 
 public interface DocumentAnalyzer {
-    Map<String, Term> analyze(File file);
+    Map<String, Term> analyze(String text);
 }
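This one-line signature change is what drives both analyzer rewrites above: implementations now receive raw text rather than a File. For reference, a toy implementation of the new contract, with whitespace splitting standing in for real Chinese segmentation; the real analyzers in this project delegate to ICTCLAS or mmseg4j instead:

import java.util.HashMap;
import java.util.Map;

// Toy implementation of the String-based contract; illustrative only.
public class WhitespaceDocumentAnalyzer implements DocumentAnalyzer {
    @Override
    public Map<String, Term> analyze(String text) {
        Map<String, Term> terms = new HashMap<String, Term>();
        for (String word : text.trim().split("\\s+")) {
            if (word.isEmpty()) {
                continue;
            }
            Term term = terms.get(word);
            if (term == null) {
                term = new Term(word);
                terms.put(word, term);
            }
            term.incrFreq();   // accumulate per-document term frequency
        }
        return terms;
    }
}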
@@ -5,8 +5,7 @@ import org.apache.commons.logging.LogFactory;
 import org.shirdrn.document.processor.common.AbstractComponent;
 import org.shirdrn.document.processor.common.Context;
-import java.io.File;
-import java.io.FileFilter;
+import java.io.*;
 
 public class BasicInformationCollector extends AbstractComponent {
@@ -34,9 +33,31 @@ public class BasicInformationCollector extends AbstractComponent {
                 return pathname.getAbsolutePath().endsWith(context.getFDMetadata().getFileExtensionName());
             }
         });
-        context.getVectorMetadata().putLabelledTotalDocCount(label, files.length);
-        LOG.info("Put document count: label=" + label + ", docCount=" + files.length);
-        totalDocCount += files.length;
+        // one document per corpus line: count lines in every file under this label
+        int totalDocLabelCount = 0;
+        for(File file : files) {
+            LineNumberReader reader = null;
+            FileReader in = null;
+            try {
+                in = new FileReader(file);
+                reader = new LineNumberReader(in);
+                // skip to EOF so getLineNumber() yields the file's line count
+                reader.skip(Long.MAX_VALUE);
+                totalDocLabelCount += reader.getLineNumber();
+            } catch (IOException e) {
+                e.printStackTrace();
+            } finally {
+                try {
+                    if(reader != null) {
+                        reader.close();
+                    }
+                    if(in != null) {
+                        in.close();
+                    }
+                } catch (IOException e) {
+                    LOG.warn(e);
+                }
+            }
+        }
+        totalDocCount += totalDocLabelCount;
+        context.getVectorMetadata().putLabelledTotalDocCount(label, totalDocLabelCount);
+        LOG.info("Put document count: label=" + label + ", docCount=" + totalDocLabelCount);
     }
     LOG.info("Total documents: totalCount=" + totalDocCount);
     context.getVectorMetadata().setTotalDocCount(totalDocCount);
...
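The per-label document count is now "total lines across the label's corpus files" rather than "number of files". One subtlety: LineNumberReader.getLineNumber() counts line terminators, so a final line without a trailing newline, which is exactly how the corpus files in this commit end, is not counted. A sketch of a counter that avoids that edge case, assuming Java 7+ try-with-resources; this is not the code in the commit:

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;

// Counts one document per non-empty line; readLine() also returns a final
// line that lacks a trailing '\n'. Sketch only.
final class LineCounter {
    static int countDocuments(File corpusFile) throws IOException {
        int count = 0;
        try (BufferedReader reader = new BufferedReader(new FileReader(corpusFile))) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (!line.trim().isEmpty()) {
                    count++;
                }
            }
        }
        return count;
    }
}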
@@ -2,11 +2,13 @@ package org.shirdrn.document.processor.component;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl;
 import org.shirdrn.document.processor.common.*;
 import org.shirdrn.document.processor.utils.ReflectionUtils;
-import java.io.File;
-import java.io.FileFilter;
+import java.io.*;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.Map;
@@ -54,29 +56,46 @@ public class DocumentWordsCollector extends AbstractComponent {
                 }
             });
             LOG.info("Prepare to analyze " + files.length + " files.");
-            int n = 0;
+            int n = 1;
             for(File file : files) {
-                analyze(label, file);
-                ++n;
+                String doc = file.getAbsolutePath();
+                // iterate over every line of the corpus file: each line is one document
+                BufferedReader br = null;
+                try {
+                    br = new BufferedReader(
+                            new InputStreamReader(new FileInputStream(file), charSet));
+                    String line = null;
+                    while((line = br.readLine()) != null) {
+                        String tempdoc = doc + "-" + n;
+                        analyze(label, tempdoc, line);
+                        n++;
+                    }
+                } catch (IOException e) {
+                    e.printStackTrace();
+                } finally {
+                    try {
+                        if(br != null) {
+                            br.close();
+                        }
+                    } catch (IOException e) {
+                        LOG.warn(e);
+                    }
+                }
             }
-            LOG.info("Analyzed files: count=" + n);
         }
         // output statistics
         stat();
     }
 
-    protected void analyze(String label, File file) {
-        String doc = file.getAbsolutePath();
-        LOG.debug("Process document: label=" + label + ", file=" + doc);
-        Map<String, Term> terms = analyzer.analyze(file);
+    protected void analyze(String label, String doc, String text) {
+        LOG.debug("Process document: label=" + label + ", text=" + text);
+        Map<String, Term> terms = analyzer.analyze(text);
         // filter terms
         filterTerms(terms);
         // construct memory structure
         context.getVectorMetadata().addTerms(label, doc, terms);
         // add inverted table as needed
         context.getVectorMetadata().addTermsToInvertedTable(label, doc, terms);
-        LOG.debug("Done: file=" + file + ", termCount=" + terms.size());
-        LOG.debug("Terms in a doc: terms=" + terms);
     }
 
     protected void filterTerms(Map<String, Term> terms) {
...
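Since one file now yields many documents, each line gets a synthetic document id: the file's absolute path plus a running counter. The counter keeps increasing across all files of a label, so ids stay unique even when several corpus files share a name. The convention in isolation, as an illustrative helper that is not part of the commit:

import java.io.File;

// Illustrative helper mirroring the commit's id convention: one id per line,
// e.g. file "/corpus/打车" plus counter 3 yields "/corpus/打车-3".
final class DocIds {
    static String lineDocumentId(File file, int lineCounter) {
        return file.getAbsolutePath() + "-" + lineCounter;
    }
}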