/*
 * Decompiled with CFR 0.152.
 */
package it.uniroma1.lcl.jlt.bnc;

import edu.stanford.nlp.ling.WordLemmaTag;
import it.uniroma1.lcl.jlt.Configuration;
import it.uniroma1.lcl.jlt.bnc.BncIterator;
import it.uniroma1.lcl.jlt.bnc.BncSentence;
import it.uniroma1.lcl.jlt.bnc.BncText;
import it.uniroma1.lcl.jlt.bnc.Cooccurrences;
import it.uniroma1.lcl.jlt.matrix.JLTMatrix;
import it.uniroma1.lcl.jlt.matrix.SentenceToVector;
import it.uniroma1.lcl.jlt.matrix.VectorComposition;
import it.uniroma1.lcl.jlt.pipeline.stanford.StanfordSentence;
import it.uniroma1.lcl.jlt.util.DoubleCounter;
import it.uniroma1.lcl.jlt.util.Files;
import it.uniroma1.lcl.jlt.util.IntegerCounter;
import it.uniroma1.lcl.jlt.util.Maths;
import it.uniroma1.lcl.jlt.util.Timer;
import it.uniroma1.lcl.jlt.util.UnorderedPair;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.FilenameFilter;
import java.io.IOException;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jblas.DoubleMatrix;

public class Bnc
implements Iterable<BncText>,
FilenameFilter {
    // Commons-logging logger shared by every method of this class.
    private static final Log log = LogFactory.getLog(Bnc.class);
    // Corpus files, listed once while the class is being initialised. The listing is
    // rooted at the configured BNC input directory and filtered through the Bnc
    // singleton itself (see accept(): only names ending in ".xml" pass).
    // NOTE(review): the trailing `true` is presumably a "recurse into sub-directories"
    // flag of Files.listFiles - confirm against that helper.
    protected static List<File> files = Files.listFiles(new File(Configuration.getInstance().getBncDataInputDir()), Bnc.getInstance(), true);
    // Cache filled by getLexiconFromFile(); stays null until that method has run.
    protected static List<String> lexicon;
    // Cache filled by getContextMatrixFromFile(); stays null until that method has run.
    protected static DoubleMatrix contextMatrix;
    // Lazily created singleton, see getInstance().
    private static Bnc instance;

    static {
        // Static initialisers run in textual order, so by the time this executes the
        // `files` initialiser above has already called getInstance() and created an
        // instance. Resetting the field here discards that object; the next call to
        // getInstance() builds a second one.
        instance = null;
    }

    /**
     * Returns the shared {@code Bnc} object, creating it the first time it is
     * requested. No synchronisation is performed.
     *
     * @return the singleton instance, never {@code null}
     */
    public static Bnc getInstance() {
        if (instance != null) {
            return instance;
        }
        instance = new Bnc();
        return instance;
    }

    /**
     * Gives access to the list of corpus files collected at class initialisation.
     *
     * @return the static file list itself (not a copy)
     */
    public static List<File> files() {
        return Bnc.files;
    }

    /**
     * {@link FilenameFilter} callback: a file is accepted when its name ends
     * with {@code .xml}. The directory argument is not consulted.
     *
     * @param dir  directory containing the candidate file (unused)
     * @param name simple name of the candidate file
     * @return {@code true} if {@code name} ends with ".xml"
     */
    @Override
    public boolean accept(File dir, String name) {
        final String xmlSuffix = ".xml";
        return name.endsWith(xmlSuffix);
    }

    /**
     * Reports how many corpus files were collected at class initialisation.
     *
     * @return size of the static file list
     */
    public static int filesNumber() {
        final List<File> corpusFiles = files;
        return corpusFiles.size();
    }

    /**
     * Creates a fresh {@link BncIterator} so the corpus can be used in a
     * for-each loop.
     *
     * @return a new iterator over the corpus texts
     */
    @Override
    public Iterator<BncText> iterator() {
        Iterator<BncText> texts = new BncIterator();
        return texts;
    }

    /**
     * Returns the context matrix, loading it from {@code filename} on first use
     * and caching it in the static {@code contextMatrix} field afterwards.
     * Once a matrix has been cached, {@code filename} is ignored.
     *
     * <p>If loading fails the error is logged and the matrix object that the
     * load was attempted on is returned (so callers never receive {@code null}),
     * but nothing is cached: a later call will try to load again instead of
     * silently handing back an empty matrix forever.
     *
     * @param filename path of the file written by {@code DoubleMatrix.save}
     * @return the cached matrix, or an uncached matrix object if loading failed
     */
    public DoubleMatrix getContextMatrixFromFile(String filename) {
        if (contextMatrix != null) {
            return contextMatrix;
        }
        DoubleMatrix loaded = new DoubleMatrix();
        try {
            loaded.load(filename);
        }
        catch (IOException e) {
            log.error("Unable to load context matrix from " + filename, e);
            // Deliberately not cached, so the next call can retry.
            return loaded;
        }
        contextMatrix = loaded;
        return contextMatrix;
    }

    /**
     * Builds the corpus lexicon for one of the predefined lexicon kinds.
     * {@code CONTENT_WORDS} restricts to the POS codes "n", "v", "a", "r";
     * {@code ANY} passes an empty POS list.
     *
     * @param lexicon which predefined lexicon to build
     * @return the set of lexicon entries, or {@code null} for an unhandled kind
     */
    public Set<String> getLexicon(LEXICON lexicon) {
        Set<String> result = null;
        switch (lexicon) {
            case CONTENT_WORDS:
                result = this.getLexicon("n", "v", "a", "r");
                break;
            case ANY:
                result = this.getLexicon(new String[0]);
                break;
            default:
                break;
        }
        return result;
    }

    /**
     * Unions the per-text lexicons of every text in the corpus.
     *
     * @param poses POS codes forwarded unchanged to {@code BncText.getLexicon}
     * @return a new set holding every entry returned by any text
     */
    public Set<String> getLexicon(String ... poses) {
        Set<String> collected = new HashSet<String>();
        Iterator<BncText> texts = Bnc.getInstance().iterator();
        while (texts.hasNext()) {
            collected.addAll(texts.next().getLexicon(poses));
        }
        return collected;
    }

    /**
     * Returns the sorted, duplicate-free list of words found in the second
     * space-separated field of each line of {@code filename}.
     *
     * <p>The result is cached in the static {@code lexicon} field; once it is
     * cached, later calls return it immediately and {@code filename} is ignored.
     * Lines with fewer than two fields (for example a trailing blank line) are
     * skipped instead of aborting the load. The reader is always closed.
     *
     * <p>If reading fails the error is logged and whatever was read so far is
     * returned without being cached, so a later call can retry.
     *
     * @param filename path of the lexicon file
     * @return the words in natural (sorted) order, never {@code null}
     */
    public List<String> getLexiconFromFile(String filename) {
        if (lexicon != null) {
            return lexicon;
        }
        // A TreeSet gives both de-duplication and the sorted order in one step.
        TreeSet<String> words = new TreeSet<String>();
        try {
            BufferedReader br = new BufferedReader(new FileReader(filename));
            try {
                String line;
                while ((line = br.readLine()) != null) {
                    String[] pieces = line.split(" ");
                    if (pieces.length < 2) {
                        continue;
                    }
                    words.add(pieces[1]);
                }
            }
            finally {
                br.close();
            }
        }
        catch (IOException e) {
            log.error("Unable to read lexicon file " + filename, e);
            // Partial result is handed back but not cached.
            return new ArrayList<String>(words);
        }
        lexicon = new ArrayList<String>(words);
        return lexicon;
    }

    /**
     * Builds the lexicon with counts for one of the predefined lexicon
     * kinds. {@code CONTENT_WORDS} restricts to the POS codes "n", "v", "a",
     * "r"; {@code ANY} passes an empty POS list.
     *
     * @param lexicon which predefined lexicon to build
     * @return the counter, or {@code null} for an unhandled kind
     */
    public IntegerCounter<String> getLexiconWithCounts(LEXICON lexicon) {
        IntegerCounter<String> result = null;
        switch (lexicon) {
            case CONTENT_WORDS:
                result = this.getLexiconWithCounts("n", "v", "a", "r");
                break;
            case ANY:
                result = this.getLexiconWithCounts(new String[0]);
                break;
            default:
                break;
        }
        return result;
    }

    /**
     * Returns the lexicon together with per-entry counts.
     *
     * <p>If the configured BNC lexicon file exists, the counts are read from it
     * (one "{@code <count> <word>}" pair per line) and {@code poses} is not
     * used. Otherwise the counts are accumulated from every text in the corpus,
     * forwarding {@code poses} to each text.
     *
     * <p>Lines with fewer than two space-separated fields are skipped. The
     * reader is always closed; an I/O failure is logged and the counts read so
     * far are returned.
     *
     * @param poses POS codes forwarded to {@code BncText.getLexiconWithCounts}
     *              when the lexicon has to be computed from the corpus
     * @return a new counter, never {@code null}
     * @throws NumberFormatException if a count field in the file is not an integer
     */
    public IntegerCounter<String> getLexiconWithCounts(String ... poses) {
        IntegerCounter<String> lexicon = new IntegerCounter<String>();
        File lexicon_file = new File(Configuration.getInstance().getBncLexiconFile());
        if (!lexicon_file.exists()) {
            for (BncText text : Bnc.getInstance()) {
                lexicon.addFrom(text.getLexiconWithCounts(poses));
            }
            return lexicon;
        }
        try {
            BufferedReader br = new BufferedReader(new FileReader(lexicon_file));
            try {
                String line;
                while ((line = br.readLine()) != null) {
                    String[] pieces = line.split(" ");
                    if (pieces.length < 2) {
                        continue;
                    }
                    lexicon.count(pieces[1], Integer.valueOf(pieces[0]));
                }
            }
            finally {
                br.close();
            }
        }
        catch (IOException e) {
            log.error("Unable to read lexicon file " + lexicon_file.getAbsolutePath(), e);
        }
        return lexicon;
    }

    /**
     * Builds the inverted index for one of the predefined lexicon kinds.
     * {@code CONTENT_WORDS} restricts to the POS codes "n", "v", "a", "r";
     * {@code ANY} passes an empty POS list.
     *
     * @param k       number of top lexicon entries to index
     * @param lexicon which predefined lexicon to use
     * @return the index, or {@code null} for an unhandled kind
     */
    public HashMap<String, IntegerCounter<BncText>> getInvertedIndex(int k, LEXICON lexicon) {
        HashMap<String, IntegerCounter<BncText>> result = null;
        switch (lexicon) {
            case CONTENT_WORDS:
                result = this.getInvertedIndex(k, "n", "v", "a", "r");
                break;
            case ANY:
                result = this.getInvertedIndex(k, new String[0]);
                break;
            default:
                break;
        }
        return result;
    }

    /**
     * Builds an inverted index from lexicon entry to the texts containing it,
     * restricted to the {@code k} top entries of the corpus-wide lexicon
     * (as returned by {@code getLexiconWithCounts(poses).getTopK(k)}).
     *
     * <p>For every text, each of its lexicon entries that belongs to the top-k
     * selection is recorded in the entry's counter with the count the text
     * reports for it.
     *
     * @param k     number of top lexicon entries to index
     * @param poses POS codes forwarded to the lexicon computations
     * @return a new map from entry to per-text counter; entries that occur in no
     *         text are absent
     */
    public HashMap<String, IntegerCounter<BncText>> getInvertedIndex(int k, String ... poses) {
        HashMap<String, IntegerCounter<BncText>> index = new HashMap<String, IntegerCounter<BncText>>();
        // Membership is tested once per word per text, so use a hash set rather
        // than scanning the top-k list each time.
        Set<Object> toplexicon = new HashSet<Object>(this.getLexiconWithCounts(poses).getTopK(k));
        for (BncText text : this) {
            IntegerCounter<String> lexiconWithCounts = text.getLexiconWithCounts(poses);
            for (String word : lexiconWithCounts.keySet()) {
                if (!toplexicon.contains(word)) {
                    continue;
                }
                IntegerCounter<BncText> postings = index.get(word);
                if (postings == null) {
                    // NOTE(review): the counter is constructed with the word as its
                    // argument, exactly as before; what that constructor argument
                    // means is not visible from this file.
                    postings = new IntegerCounter(word);
                    index.put(word, postings);
                }
                postings.count(text, (Integer)lexiconWithCounts.get(word));
            }
        }
        return index;
    }

    /**
     * Returns the corpus co-occurrence counts.
     *
     * <p>If the configured co-occurrence file exists it is parsed line by line
     * with the pattern {@code <w1, w2> count}. Lines that do not match the
     * pattern, or whose count field is empty, are logged and skipped rather
     * than aborting the load. The reader is always closed; an I/O failure is
     * logged and the pairs read so far are returned.
     *
     * <p>If the file does not exist, the co-occurrences of every text in the
     * corpus are merged, with a progress message every 500 texts.
     *
     * @return a new {@code Cooccurrences} object, never {@code null}
     */
    public Cooccurrences getCooccurrences() {
        Cooccurrences cooccurrences = new Cooccurrences();
        File cooccurrence_file = new File(Configuration.getInstance().getBncCooccurrenceFile());
        if (!cooccurrence_file.exists()) {
            log.info(cooccurrence_file.getAbsolutePath() + " does not exists. Calculating cooccurrences directly from the corpus.");
            int done = 0;
            for (BncText text : this) {
                cooccurrences.merge(text.getCooccurrences());
                if (++done % 500 == 0) {
                    log.info(done + " texts parsed for cooccurrences.");
                }
            }
            return cooccurrences;
        }
        log.info("Loading file: " + cooccurrence_file.getAbsolutePath());
        try {
            BufferedReader br = new BufferedReader(new FileReader(cooccurrence_file));
            try {
                Pattern pattern = Pattern.compile("<(.+), (.+)> (\\d*)");
                String line;
                while ((line = br.readLine()) != null) {
                    Matcher m = pattern.matcher(line);
                    // group() is only legal after a successful find(); the count
                    // group may also match the empty string because of \d*.
                    if (!m.find() || m.group(3).length() == 0) {
                        log.warn("Skipping malformed cooccurrence line: " + line);
                        continue;
                    }
                    String w1 = m.group(1);
                    String w2 = m.group(2);
                    Integer count = Integer.valueOf(m.group(3));
                    cooccurrences.addCooccurrence(new UnorderedPair<String>(w1, w2), count);
                }
            }
            finally {
                br.close();
            }
        }
        catch (IOException e) {
            log.error("Unable to read cooccurrence file " + cooccurrence_file.getAbsolutePath(), e);
        }
        log.info("File: " + cooccurrence_file.getAbsolutePath() + " loaded.");
        return cooccurrences;
    }

    /**
     * Writes every sentence of the corpus to {@code filepath}, one per line,
     * as the sentence id, a space, and the sentence's string form.
     *
     * <p>The writer is flushed after each text and is always closed, even when
     * a write fails. Progress is logged every 1000 sentences and the total is
     * logged at the end. An I/O failure is logged and ends the export early.
     *
     * @param filepath destination file, opened through {@code Files.getBufferedWriter}
     */
    public static void exportSentences(String filepath) {
        int sentences = 0;
        try {
            BufferedWriter fw = Files.getBufferedWriter(filepath);
            try {
                BncIterator iterator = new BncIterator();
                while (iterator.hasNext()) {
                    BncText text = iterator.next();
                    for (BncSentence sentence : text) {
                        fw.write(String.valueOf(sentence.getID()) + " " + sentence + "\n");
                        if (++sentences % 1000 == 0) {
                            log.info(sentences + " sentences written out.");
                        }
                    }
                    fw.flush();
                }
            }
            finally {
                fw.close();
            }
        }
        catch (IOException e) {
            log.error("Unable to export sentences to " + filepath, e);
        }
        log.info("Total BNC sentences: " + sentences);
    }

    /**
     * Ad-hoc command-line driver for the corpus utilities.
     *
     * <p>Which steps run is decided by the hard-coded boolean switches at the
     * top of the method; as shipped only the corpus overview is enabled. The
     * remaining steps print the lexicon, write the lexicon with counts, print an
     * inverted index, write the top-K co-occurrences, build and save the
     * co-occurrence (PMI) and context matrices, dump the best-associated words
     * per lexicon entry, compare a few sample sentences under every
     * {@code VectorComposition}, and run a small co-occurrence self-test.
     *
     * <p>Every file writer opened here is closed in a {@code finally} block.
     *
     * @param args ignored
     * @throws IOException if one of the output files or matrices cannot be
     *                     written or read
     */
    public static void main(String[] args) throws IOException {
        Timer general = new Timer();
        Bnc bncdi = Bnc.getInstance();

        // Step switches: flip to true to enable the corresponding section below.
        boolean print_overview = true;
        boolean print_lexicon = false;
        boolean build_lexiconWithCounts = false;
        boolean print_invertedIndex = false;
        boolean build_cooccurrences = false;
        boolean test_cooccurrences = false;
        boolean testCompositionality = false;
        boolean build_matrix = false;
        boolean buildContextMatrix = false;
        boolean seeWords = false;

        LEXICON lexicon_choice = LEXICON.CONTENT_WORDS;
        int topK = Configuration.getInstance().getBncTopKWords();
        int window = Configuration.getInstance().getBncCooccurrenceWindow();
        DOCUMENT_TYPE_FILTER docs = DOCUMENT_TYPE_FILTER.valueOf(Configuration.getInstance().getBncDocuments());
        String contextMatrixPath = Configuration.getInstance().getBncMatrixOfContextsPath();
        String cooccurrenceMatrixPath = Configuration.getInstance().getBncMatrixOfCooccurrencesPath();
        log.info("LEXICON: " + lexicon_choice);
        log.info("TopK: " + topK);
        log.info("Window size: " + window);
        log.info("DOCUMENT_TYPE_FILTER: " + docs);
        log.info("Contexts matrix path: " + contextMatrixPath);
        log.info("Cooccurrence matrix path: " + cooccurrenceMatrixPath);

        if (print_overview) {
            log.info("=== BNC - OVERVIEW ===");
            int num_sentences = 0;
            int num_words = 0;
            for (BncText text : bncdi) {
                log.info("Id: " + text.getId());
                log.info("\tType: " + text.getType());
                log.info("\tTitle: " + text.getTitle());
                log.info("\tDomain: " + text.getDomain());
                log.info("\tKeywords: " + text.getKeywords());
                log.info("\tCategory: " + text.getCategory());
                log.info("\tMedium: " + text.getMedium());
                for (Object sentence : text) {
                    ++num_sentences;
                    num_words += ((StanfordSentence)sentence).getWords().size();
                }
            }
            log.info("|Texts(BNC)| = " + Bnc.filesNumber());
            log.info("|Sentences(BNC)| = " + num_sentences);
            log.info("|Words(BNC)| = " + num_words);
        }

        if (print_lexicon) {
            log.info("=== BNC - LEXICON===");
            Timer lexicon_timer = new Timer();
            Set<String> lexicon = bncdi.getLexicon(lexicon_choice);
            log.info("Simple Lexicon built in " + lexicon_timer.getTimeInString());
            log.info("|Lexicon|=" + lexicon.size());
            for (String word : lexicon) {
                log.info(word);
            }
        }

        if (build_lexiconWithCounts) {
            log.info("=== BNC - LEXICON WITH COUNTS ===");
            Timer lexiconWithCounts_timer = new Timer();
            IntegerCounter<String> lexiconWithCounts = bncdi.getLexiconWithCounts(lexicon_choice);
            log.info("Lexicon with counts built in " + lexiconWithCounts_timer.getTimeInString());
            log.info("|Lexicon|=" + lexiconWithCounts.size());
            BufferedWriter wr = new BufferedWriter(new FileWriter("bnc_lexiconcounts-all.txt"));
            try {
                for (Object word : lexiconWithCounts.getSortedElements()) {
                    wr.write(lexiconWithCounts.get((String)word) + " " + (String)word + "\n");
                }
            }
            finally {
                wr.close();
            }
        }

        if (print_invertedIndex) {
            log.info("=== BNC - INVERTED INDEX ===");
            Timer invertedIndex_timer = new Timer();
            HashMap<String, IntegerCounter<BncText>> invertedIndex = bncdi.getInvertedIndex(10, lexicon_choice);
            log.info("Inverted index built in " + invertedIndex_timer.getTimeInString());
            for (String word : invertedIndex.keySet()) {
                StringBuilder output = new StringBuilder();
                IntegerCounter<BncText> counter = invertedIndex.get(word);
                for (BncText text : counter.keySet()) {
                    output.append(" (").append(text.getId()).append(",").append(counter.get(text)).append(")");
                }
                log.info(word + " -> " + output);
            }
        }

        if (build_cooccurrences) {
            log.info("=== BNC - COOCCURRENCES ===");
            Timer lexiconWithCounts_timer = new Timer();
            IntegerCounter<String> lexicon = bncdi.getLexiconWithCounts(lexicon_choice);
            // Hash set: membership is tested twice for every co-occurrence pair.
            Set<Object> topKwords = new HashSet<Object>(lexicon.getTopK(topK));
            log.info("Top-" + topK + " Lexicon with counts built in " + lexiconWithCounts_timer.getTimeInString());
            Timer cooccurrences_timer = new Timer();
            Cooccurrences cooccurrences = bncdi.getCooccurrences();
            log.info("Cooccurrences built in " + cooccurrences_timer.getTimeInString());
            log.info(cooccurrences.getTotalWithoutRepetitions() + " cooccorrences found.");
            int seen = 0;
            int selected = 0;
            BufferedWriter wr = new BufferedWriter(new FileWriter(Configuration.getInstance().getBncCooccurrenceFile()));
            try {
                for (UnorderedPair<String> cooccurrence : cooccurrences) {
                    if (++seen % 10000000 == 0) {
                        log.info(seen + " occurrences seen.");
                    }
                    // Keep the pair only when both of its words are top-K words.
                    if (!topKwords.contains(cooccurrence.getFirst()) || !topKwords.contains(cooccurrence.getSecond())) {
                        continue;
                    }
                    wr.write(cooccurrence + " " + cooccurrences.getCount(cooccurrence) + "\n");
                    if (++selected % 1000000 == 0) {
                        log.info(selected + " occurrences selected.");
                    }
                }
            }
            finally {
                wr.close();
            }
        }

        if (build_matrix) {
            log.info("=== BNC - COOCCURRENCE MATRIX ===");
            Cooccurrences occ = bncdi.getCooccurrences();
            IntegerCounter<String> counts = occ.getLexicon();
            String[] words = new TreeSet<String>(counts.keySet()).toArray(new String[0]);
            int n = words.length;
            log.info("Building a cooccurrence matrix of dimension " + n + " x " + n);
            DoubleMatrix matrix = new DoubleMatrix(n, n);
            for (int i = 0; i < n; ++i) {
                String w1 = words[i];
                log.info("Calculating row for " + w1);
                int count1 = (Integer)counts.get(w1);
                for (int j = 0; j < n; ++j) {
                    String w2 = words[j];
                    int count2 = (Integer)counts.get(w2);
                    int joint = occ.getCount(new UnorderedPair<String>(w1, w2));
                    double pmi = Maths.pmi(joint, count1, count2);
                    matrix.put(i, j, pmi);
                    matrix.put(j, i, pmi);
                }
            }
            log.info("Storing cooccurrences matrix to " + cooccurrenceMatrixPath);
            matrix.save(cooccurrenceMatrixPath);
            log.info("Non-zero entries: " + matrix.findIndices().length);
        }

        if (buildContextMatrix) {
            Cooccurrences occ = bncdi.getCooccurrences();
            log.debug(occ.getTotalWithoutRepetitions() + " occurrences loaded (for a total of " + occ.getTotal() + ")");
            IntegerCounter<String> counts = occ.getLexicon();
            log.debug("Lexicon with counts: " + counts.keySet().size());
            String[] words = new TreeSet<String>(counts.keySet()).toArray(new String[0]);
            int n = words.length;
            double total = counts.getTotal();
            log.debug("Creating a context matrix " + n + "*" + n);
            DoubleMatrix matrix = new DoubleMatrix(n, n);
            for (int i = 0; i < n; ++i) {
                String w1 = words[i];
                log.debug("Calculating row vector for " + w1);
                int count1 = (Integer)counts.get(w1);
                for (int j = 0; j < n; ++j) {
                    String w2 = words[j];
                    int count2 = (Integer)counts.get(w2);
                    double joint = occ.getCount(new UnorderedPair<String>(w1, w2));
                    // Entry is p(w2 | w1) / p(w2).
                    double pconditioned = joint / (double)count1;
                    double p2 = (double)count2 / total;
                    matrix.put(i, j, pconditioned / p2);
                }
            }
            log.info("Storing context matrix to " + contextMatrixPath);
            matrix.save(contextMatrixPath);
        }

        if (seeWords) {
            Cooccurrences occ = bncdi.getCooccurrences();
            IntegerCounter<String> counts = occ.getLexicon();
            String[] words = new TreeSet<String>(counts.keySet()).toArray(new String[0]);
            int n = words.length;
            double total = counts.getTotal();
            NumberFormat nf = NumberFormat.getInstance(Locale.UK);
            nf.setMaximumFractionDigits(6);
            BufferedWriter wr_best = new BufferedWriter(new FileWriter("bnc_best_words.txt"));
            try {
                for (int i = 0; i < n; ++i) {
                    String w1 = words[i];
                    int count_word1 = (Integer)counts.get(w1);
                    wr_best.write(w1 + " [" + counts.keySet().size() + " words]\tfreq(" + w1 + ")=" + count_word1 + "\n");
                    DoubleCounter<String> rank = new DoubleCounter<String>();
                    DoubleCounter<String> joint_rank = new DoubleCounter<String>();
                    for (int j = 0; j < n; ++j) {
                        String w2 = words[j];
                        int count_word2 = (Integer)counts.get(w2);
                        double joint = occ.getCount(new UnorderedPair<String>(w1, w2));
                        double pconditioned = joint / (double)count_word1;
                        double p2 = (double)count_word2 / total;
                        joint_rank.count(w2, joint);
                        rank.count(w2, pconditioned / p2);
                    }
                    for (String w2 : rank.getTopK(50)) {
                        int count_word2 = (Integer)counts.get(w2);
                        double joint = (Double)joint_rank.get(w2);
                        double v = (Double)rank.get(w2);
                        wr_best.write("\t" + w2 + "\t\t" + "v(" + w2 + ")=" + nf.format(v) + "\tp(" + w2 + "|" + w1 + ")=" + nf.format(joint / (double)count_word1) + "\tp(" + w2 + ")=" + nf.format((double)count_word2 / total) + "\tfreq(" + w1 + "," + w2 + ")=" + (int)joint + "\tfreq(" + w2 + ")=" + count_word2 + "\tfreq(" + w1 + ")=" + count_word1 + "\tfreq(total)=" + (int)total + "\n");
                    }
                }
            }
            finally {
                wr_best.close();
            }
        }

        if (testCompositionality) {
            String[] sentences = new String[]{"plane#n fly#v blue#a sky#n", "choose#v flight#n head#v home#n", "eat#v lunch#n wife#n", "dinner#n prefer#v sleep#v", "job#n hard#a day#n night#n"};
            ArrayList<String> lexicon = new ArrayList<String>(new TreeSet<String>(bncdi.getCooccurrences().getLexicon().keySet()));
            DoubleMatrix contextMatrix = new DoubleMatrix();
            contextMatrix.load(contextMatrixPath);
            SentenceToVector converter = new SentenceToVector(contextMatrix, lexicon);
            int k = 1;
            log.info("");
            for (String sentence : sentences) {
                log.info("Sentence" + k++ + ": " + sentence);
            }
            for (VectorComposition composition : VectorComposition.values()) {
                log.info("\n\n\nVECTOR COMPOSITION: " + composition);
                DoubleMatrix[] sentences_vectors = new DoubleMatrix[sentences.length];
                for (int idx = 0; idx < sentences.length; ++idx) {
                    String sentence = sentences[idx];
                    List<String> sentenceWords = Arrays.asList(sentence.split(" "));
                    DoubleMatrix v = converter.convertSentence(sentenceWords, composition);
                    sentences_vectors[idx] = v;
                    log.debug("Sentence \"" + sentence + "\" => " + v.findIndices().length + " non-zero entries.");
                }
                // Pairwise similarity of every unordered pair of sample sentences.
                for (int a = 0; a < sentences.length; ++a) {
                    for (int b = a + 1; b < sentences.length; ++b) {
                        log.info("Similarity(\"" + sentences[a] + "\", \"" + sentences[b] + "\") = " + JLTMatrix.cos_sim(sentences_vectors[a], sentences_vectors[b]));
                    }
                }
            }
        }

        if (test_cooccurrences) {
            Cooccurrences cooccurrences = new Cooccurrences();
            String input = "a a b c a b c a a b c";
            log.info("Original sentence : " + input);
            StanfordSentence s = new StanfordSentence();
            for (String letter : input.split(" ")) {
                s.addWord(new WordLemmaTag("", letter, ""));
            }
            WordLemmaTag[] wordArray = s.getWords().toArray(new WordLemmaTag[0]);
            int width = Configuration.getInstance().getBncCooccurrenceWindow();
            for (int i = 0; i < wordArray.length; ++i) {
                WordLemmaTag w1 = wordArray[i];
                String word1 = w1.lemma() + "#" + w1.tag();
                log.debug("First word = " + word1);
                // Pair the word with each of the next `width` words, clipped to the sentence end.
                int right = Math.min(wordArray.length - 1, i + width);
                for (int j = i + 1; j <= right; ++j) {
                    WordLemmaTag w2 = wordArray[j];
                    String word2 = w2.lemma() + "#" + w2.tag();
                    log.debug("\tSecond word = " + word2);
                    UnorderedPair<String> pair = new UnorderedPair<String>(word1, word2);
                    cooccurrences.addCooccurrence(pair);
                    log.info(pair);
                }
                log.info("");
            }
            IntegerCounter<String> lexicon = cooccurrences.getLexicon();
            for (String letter : lexicon.keySet()) {
                log.info("Freq(" + letter + ") = " + lexicon.get(letter));
            }
            for (UnorderedPair<String> pair : cooccurrences) {
                log.info("Freq" + pair + " = " + cooccurrences.getCount(pair));
            }
        }

        log.info("PROJECT TIME ELAPSED " + general.getTimeInString());
    }

    protected static enum DOCUMENT_TYPE_FILTER {
        WRITTEN,
        WRITTEN_AND_SPOKEN;

    }

    protected static enum LEXICON {
        CONTENT_WORDS,
        ANY;

    }
}

