欢迎您访问 最编程 本站为您分享编程语言代码,编程技术文章!
您现在的位置是: 首页

实操指南:Java CharsetDetector 类的运用实例

最编程 2024-02-11 15:27:46
...

实例1: checkCharset

import com.ibm.icu.text.CharsetDetector; //导入依赖的package包/类
/**
 * Runs ICU charset detection over the given input stream.
 *
 * <p>If the detector cannot read the stream, the stream is closed, the
 * failure is printed, and {@code null} is returned. Detection is not
 * attempted in that case because the detector never received usable text.
 *
 * <p>NOTE(review): ICU's stream-based {@code setText} appears to rely on
 * mark/reset support of the stream; callers passing a raw file stream may
 * need to wrap it in a buffered stream first — confirm against the ICU docs.
 *
 * @param input the stream whose encoding should be detected
 * @return the match reported by the detector, or {@code null} if the stream
 *         could not be read (the detector itself may also report no match)
 */
public static CharsetMatch checkCharset(InputStream input) {
	CharsetDetector cd = new CharsetDetector();
	try {
		cd.setText(input);
	} catch (IOException e) {
		// The stream is unusable after a read failure; release it right away.
		try {
			input.close();
		} catch (IOException e1) {
			e1.printStackTrace();
		}
		e.printStackTrace();
		// Do not run detection on a detector whose text failed to load.
		return null;
	}
	return cd.detect();
}
 

实例2: detectEncoding

import com.ibm.icu.text.CharsetDetector; //导入依赖的package包/类
/**
     * Detects the character encoding of an input stream using icu4j.
     * Only text-type input streams can be detected.
     * (This approach replaces juniversalchardet.)
     *
     * @param in the text stream whose encoding should be detected
     * @return the detected charset, or {@code null} when the detector reports no match
     *         or the detected name cannot be resolved to a charset on this JVM
     * @throws IOException if the detector fails while reading {@code in}
     */
    public static Charset detectEncoding(InputStream in) throws IOException {
        final CharsetDetector detector = new CharsetDetector();
        detector.setText(in);

        final CharsetMatch charsetMatch = detector.detect();
        if (charsetMatch == null) {
            log.info("Cannot detect source charset.");
            return null;
        }
        // Confidence is an integer from 0 to 100; the higher the value, the more
        // reliable the detection result.
        int confidence = charsetMatch.getConfidence();
        final String name = charsetMatch.getName();
        log.info("CharsetMatch: {} ({}% 相似度,相似度小于 50% 时,可能编码无法判断。)", name, confidence);
        try {
            return Charset.forName(name);
        } catch (IllegalArgumentException e) {
            // Charset.forName signals an illegal or unsupported name with unchecked
            // exceptions that both extend IllegalArgumentException. Report that case
            // the same way as "no match" instead of letting it escape to the caller.
            log.info("Detected charset {} is not supported by this JVM.", name);
            return null;
        }
    }
 

实例3: getText

import com.ibm.icu.text.CharsetDetector; //导入依赖的package包/类
/**
 * Extract text to be indexed.
 *
 * <p>Looks up the extractor registered for {@code mimeType}. For
 * {@code text/*} content with no declared encoding, the encoding is guessed
 * with ICU's charset detector before extraction. The content stream is
 * always closed before this method returns, whether it succeeds or throws.
 *
 * @param mimeType  MIME type used to select the extractor
 * @param encoding  declared encoding, or {@code null} to detect it for text content
 * @param isContent the content to extract text from; closed by this method
 * @return the text produced by the extractor
 * @throws IOException if no extractor is registered for {@code mimeType},
 *                     or if reading/extracting fails
 */
public static String getText(String mimeType, String encoding, InputStream isContent) throws IOException {
	BufferedInputStream bis = new BufferedInputStream(isContent);
	try {
		TextExtractor te = engine.get(mimeType);
		if (te == null) {
			throw new IOException("Full text indexing of '" + mimeType + "' is not supported");
		}

		if (mimeType.startsWith("text/") && encoding == null) {
			CharsetDetector detector = new CharsetDetector();
			detector.setText(bis);
			CharsetMatch cm = detector.detect();
			// The detector may report no match; leave the encoding unset in that case.
			if (cm != null) {
				encoding = cm.getName();
			}
		}

		return te.extractText(bis, mimeType, encoding);
	} finally {
		// Release the stream on every path, including the failure paths.
		IOUtils.closeQuietly(bis);
	}
}
 

实例4: showEncode

import com.ibm.icu.text.CharsetDetector; //导入依赖的package包/类
/**
 * Builds a display string describing the detected encoding of a document's text.
 *
 * <p>The document text is converted to a stream and passed to ICU's charset
 * detector. When the detector reports no match, or a match without a name,
 * the name "NULL" is used. A missing match is treated as confidence 0 so that
 * {@code isPoorMatch} decides whether the UTF-8 verification runs. The result
 * of {@code showByteOfMark} is appended to the name.
 *
 * @param doc the document whose text is inspected
 * @return the encoding description; if reading the document or the stream
 *         fails, the exception is reported and whatever was computed so far
 *         (possibly the empty string) is returned
 */
protected String showEncode(Document doc) {
  String charsetName = "";
  try {
    String convertedPlainText = doc.getText(0, doc.getLength());
    try (InputStream is = convertStringToStream(convertedPlainText)) {
      CharsetMatch charsetMatch = new CharsetDetector().setText(is).detect();
      // Guard against the detector reporting no match at all.
      String detectedName = charsetMatch != null ? charsetMatch.getName() : null;
      int confidence = charsetMatch != null ? charsetMatch.getConfidence() : 0;
      charsetName = detectedName != null ? detectedName : "NULL";
      if (isPoorMatch(confidence)) {
        charsetName = verifyPossibleUtf8(charsetName, is);
      }
      charsetName += showByteOfMark(is);
    }
  } catch (BadLocationException | IOException ex) {
    Exceptions.printStackTrace(ex);
  }
  return charsetName;
}
 

实例5: fileAnyEncodingToString

import com.ibm.icu.text.CharsetDetector; //导入依赖的package包/类
/**
 * Read a text file detecting encoding using http://userguide.icu-project.org/conversion/detection
 * Return the file contents as a String.
 *
 * <p>Two line separators are appended to the decoded text. If the detector
 * cannot decode the bytes, the content is decoded as UTF-8 instead.
 *
 * @param f the file to read
 * @return the decoded file contents followed by two line separators
 * @throws IOException if the file cannot be opened or read
 */
public static String fileAnyEncodingToString(File f) throws IOException {

  byte[] byteData;
  // Close the file handle as soon as the bytes are in memory.
  try (FileInputStream in = new FileInputStream(f)) {
    byteData = IOUtils.toByteArray(in);
  }

  CharsetDetector detector = new CharsetDetector();

  String unicodeData = detector.getString(byteData, null);
  if (unicodeData == null) {
    // Without this guard the concatenation below would produce the literal
    // text "null" as the file content.
    LOGGER.warn("{} has no detectable encoding, decoding as UTF-8", f.getName());
    unicodeData = new String(byteData, "UTF-8");
  }
  // Add to newline at the end of the file otherwise the subtitle parser library can get confused by EOF
  unicodeData += System.getProperty("line.separator") + System.getProperty("line.separator");
  CharsetMatch match = detector.detect();
  if (match != null && match.getConfidence() > 60) {
    LOGGER.debug("{} has a detected encoding: {}", f.getName(), match.getName());
    if (match.getLanguage() != null) {
      LOGGER.debug("{} has a detected language: {}", f.getName(), match.getLanguage());
    }
  }
  return unicodeData;
}