/*
 * Decompiled with CFR 0.152.
 */
package se.lth.cs.srl.preprocessor.tokenization;

import edu.stanford.nlp.ie.crf.CRFClassifier;
import java.io.File;
import java.util.Properties;
import se.lth.cs.srl.preprocessor.tokenization.Tokenizer;
import se.lth.cs.srl.util.FileExistenceVerifier;

/**
 * {@link Tokenizer} implementation that delegates word segmentation to a
 * Stanford {@link CRFClassifier} loaded from a model directory.
 *
 * <p>The token array returned by {@link #tokenize(String)} always starts with
 * the artificial {@code "<root>"} token, followed by the segments produced by
 * the classifier.
 */
public class StanfordChineseSegmenterWrapper
implements Tokenizer {
    /** Classifier used to segment sentences; configured and loaded in the constructor. */
    CRFClassifier classifier;

    /**
     * Configures and loads the segmenter from the given data directory.
     *
     * <p>The directory must contain the files {@code dict-chris6.ser.gz} and
     * {@code ctb.gz}; both are checked through
     * {@link FileExistenceVerifier#verifyFiles} before anything is loaded.
     *
     * @param dataDir directory holding the segmenter dictionary and model files
     * @throws Error if the file check reports a problem (the reported message
     *         is used as the error message)
     */
    public StanfordChineseSegmenterWrapper(File dataDir) {
        File serDictionaryFile = new File(dataDir, "dict-chris6.ser.gz");
        File ctbFile = new File(dataDir, "ctb.gz");
        // A non-null result from the verifier is treated as a failure description.
        String error = FileExistenceVerifier.verifyFiles(serDictionaryFile, ctbFile);
        if (error != null) {
            throw new Error(error);
        }
        Properties props = new Properties();
        props.setProperty("sighanCorporaDict", dataDir.toString());
        props.setProperty("serDictionary", serDictionaryFile.toString());
        props.setProperty("inputEncoding", "UTF-8");
        props.setProperty("sighanPostProcessing", "true");
        this.classifier = new CRFClassifier(props);
        this.classifier.loadClassifierNoExceptions(ctbFile.toString(), props);
        // Re-apply the properties after loading the model file.
        // NOTE(review): presumably loading overwrites the classifier flags - confirm.
        this.classifier.flags.setProperties(props);
    }

    /**
     * Segments a sentence and prepends the {@code "<root>"} token.
     *
     * @param sentence the raw sentence to segment
     * @return an array whose first element is {@code "<root>"} and whose
     *         remaining elements are the segments returned by the classifier,
     *         in order
     */
    @Override
    public String[] tokenize(String sentence) {
        // Collection.toArray() is only guaranteed to return Object[]; casting that
        // result to String[] fails with ClassCastException for most List
        // implementations. Keep it as Object[] and let System.arraycopy move the
        // elements into the String[] result (it type-checks each element).
        Object[] segments = this.classifier.segmentString(sentence).toArray();
        String[] withRoot = new String[segments.length + 1];
        withRoot[0] = "<root>";
        System.arraycopy(segments, 0, withRoot, 1, segments.length);
        return withRoot;
    }
}

