/*
 * Decompiled with CFR 0.152.
 */
package edu.mayo.bmi.nlp.tokenizer;

import edu.mayo.bmi.nlp.tokenizer.OffsetComparator;
import edu.mayo.bmi.nlp.tokenizer.Token;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;

/**
 * Splits text into {@link Token}s using whitespace, punctuation, contraction and
 * (optionally) hyphenation-frequency rules, then classifies each token.
 *
 * <p>Processing order in {@link #tokenize(String)}:
 * <ol>
 *   <li>collect one end-of-line token per {@code '\r'} / {@code '\n'} character;</li>
 *   <li>split the text into raw tokens on space, tab, newline and carriage return;</li>
 *   <li>peel leading/trailing punctuation and symbols, split contractions
 *       ({@code 's}, {@code n't}, {@code 're}, ...) and "cannot", and split on
 *       punctuation found inside a token;</li>
 *   <li>classify the remaining tokens as number or word and record capitalization
 *       and digit-position attributes for words.</li>
 * </ol>
 *
 * <p>The numeric type, capitalization and digit-position codes below are the literal
 * values this class passes to {@link Token}; they presumably mirror constants declared
 * on {@code Token} itself -- TODO confirm and reuse those constants if they exist.
 */
public class Tokenizer {
    /** Type of a token that has not been classified yet (the value tested before defaulting to a word). */
    private static final byte TYPE_UNKNOWN = 0;
    /** Type assigned to ordinary words. */
    private static final byte TYPE_WORD = 1;
    /** Type assigned to tokens accepted by {@link #isNumber(String)}. */
    private static final byte TYPE_NUMBER = 2;
    /** Type assigned to characters accepted by {@link #isPunctuation(char)}. */
    private static final byte TYPE_PUNCT = 3;
    /** Type assigned to carriage-return / newline tokens. */
    private static final byte TYPE_EOL = 4;
    /** Type assigned to the suffix split off a contraction (e.g. {@code 's}, {@code n't}). */
    private static final byte TYPE_CONTRACTION = 5;
    /** Type assigned to non-alphanumeric characters that are not punctuation. */
    private static final byte TYPE_SYMBOL = 6;

    /** Capitalization code: the word contains no uppercase character. */
    private static final byte CAPS_NONE = 1;
    /** Capitalization code: uppercase characters are mixed in at arbitrary positions. */
    private static final byte CAPS_MIXED = 2;
    /** Capitalization code: only the first character is uppercase. */
    private static final byte CAPS_FIRST_ONLY = 3;
    /** Capitalization code: every character is uppercase. */
    private static final byte CAPS_ALL = 4;

    /** Digit-position code: the word contains no digit. */
    private static final byte NUM_NONE = 0;
    /** Digit-position code: the first character is a digit. */
    private static final byte NUM_FIRST = 1;
    /** Digit-position code: digits occur, but neither first nor last. */
    private static final byte NUM_MIDDLE = 2;
    /** Digit-position code: the last character is a digit (and the first is not). */
    private static final byte NUM_LAST = 3;

    private final OffsetComparator iv_offsetComp = new OffsetComparator();
    /** Lower-cased hyphenated word -> frequency; {@code null} disables hyphen preservation. */
    private final Map<String, Integer> iv_hyphMap;
    /** A hyphenated word is kept whole only when its frequency is strictly greater than this. */
    private final int iv_freqCutoff;

    /**
     * Creates a tokenizer without hyphenation data: every hyphen found inside a token
     * splits it.
     */
    public Tokenizer() {
        this(null, 0);
    }

    /**
     * Creates a tokenizer that keeps frequent hyphenated words intact.
     *
     * @param hyphMap    frequency per hyphenated word; looked up with the lower-cased
     *                   token text, so keys are expected to be lower case. May be
     *                   {@code null} to disable the feature.
     * @param freqCutoff a hyphenated word is preserved only if its frequency is
     *                   strictly greater than this value
     */
    public Tokenizer(Map<String, Integer> hyphMap, int freqCutoff) {
        this.iv_hyphMap = hyphMap;
        this.iv_freqCutoff = freqCutoff;
    }

    /**
     * Checks that every key of the hyphen map has a non-null {@link Integer} frequency.
     *
     * @param hyphMap map to validate; must not be {@code null}
     * @throws Exception if a value is {@code null} or is not a {@code java.lang.Integer}
     */
    public static void validateHyphenMap(Map<String, Integer> hyphMap) throws Exception {
        // Inspect through wildcards: a map filled via raw types can hold non-Integer
        // values, and reading them as Integer would fail with ClassCastException
        // before the type check below could ever report the offending key.
        Map<?, ?> untyped = hyphMap;
        for (Map.Entry<?, ?> entry : untyped.entrySet()) {
            Object frequency = entry.getValue();
            if (frequency == null) {
                throw new Exception("Hyphen map is missing frequency data for key=" + entry.getKey());
            }
            if (!(frequency instanceof Integer)) {
                throw new Exception("Hyphen map has non java.lang.Integer frequency data for key=" + entry.getKey());
            }
        }
    }

    /**
     * Tokenizes the text and orders the result with the tokenizer's {@code OffsetComparator}.
     *
     * @param text text to tokenize
     * @return the tokens produced by {@link #tokenize(String)}, sorted in place
     * @throws Exception if tokenization fails
     */
    public List<Token> tokenizeAndSort(String text) throws Exception {
        List<Token> tokenList = this.tokenize(text);
        Collections.sort(tokenList, this.iv_offsetComp);
        return tokenList;
    }

    /**
     * Tokenizes the text. The returned list is not sorted: split-off pieces and
     * end-of-line tokens are appended after the tokens they were derived from.
     *
     * @param text text to tokenize
     * @return mutable list of tokens, each with its text set to the covered substring
     * @throws Exception wrapping any failure raised while tokenizing; the original
     *                   failure is available through {@link Throwable#getCause()}
     */
    public List<Token> tokenize(String text) throws Exception {
        try {
            List<Token> eolTokens = this.getEndOfLineTokens(text);
            List<Token> tokens = this.getRawTokens(text);
            this.applyPunctSymbolRules(tokens, text);
            for (Token token : tokens) {
                this.classifyToken(token, text.substring(token.getStartOffset(), token.getEndOffset()));
            }
            // End-of-line tokens are added after classification so they keep TYPE_EOL.
            tokens.addAll(eolTokens);
            for (Token token : tokens) {
                token.setText(text.substring(token.getStartOffset(), token.getEndOffset()));
            }
            return tokens;
        }
        catch (Exception e) {
            // Chain the cause instead of printing it, so callers can see what actually failed.
            throw new Exception("Internal Error with Tokenizer.", e);
        }
    }

    /**
     * Assigns number/word type and, for words, capitalization and digit-position
     * attributes. Punctuation tokens are left untouched.
     */
    private void classifyToken(Token token, String tokenText) {
        if (token.getType() == TYPE_PUNCT) {
            return;
        }
        if (Tokenizer.isNumber(tokenText)) {
            token.setType(TYPE_NUMBER);
            token.setIsInteger(this.isInteger(tokenText));
        }
        if (token.getType() == TYPE_UNKNOWN) {
            token.setType(TYPE_WORD);
        }
        if (token.getType() == TYPE_WORD) {
            this.applyCapitalizationRules(token, tokenText);
            this.applyWordNumRules(token, tokenText);
        }
    }

    /**
     * Rewrites {@code tokens} in place so that punctuation, symbols and contraction
     * suffixes become tokens of their own.
     *
     * @param tokens raw tokens; modified in place
     * @param text   the text the token offsets refer to
     */
    private void applyPunctSymbolRules(List<Token> tokens, String text) {
        List<Token> newTokenList = new ArrayList<Token>();
        List<Token> removeTokenList = new ArrayList<Token>();
        // Index-based on purpose: findPunctSymbolInsideToken appends the pieces of a
        // split token to 'tokens' while this loop runs, and those pieces must be
        // visited as well (an iterator would fail with ConcurrentModificationException).
        for (int tIndex = 0; tIndex < tokens.size(); ++tIndex) {
            Token token = tokens.get(tIndex);
            String tokenText = text.substring(token.getStartOffset(), token.getEndOffset());
            if (tokenText.length() == 1) {
                char onlyChar = tokenText.charAt(0);
                if (!this.isAlphabetLetterOrDigit(onlyChar)) {
                    token.setType(this.punctOrSymbolType(onlyChar));
                }
                continue;
            }

            // Peel leading, then trailing, non-alphanumeric characters off the token.
            int startCnt = this.processStartPunctSymbol(newTokenList, token, tokenText);
            token.setStartOffset(token.getStartOffset() + startCnt);
            tokenText = text.substring(token.getStartOffset(), token.getEndOffset());
            int endCnt = this.processEndPunctSymbol(newTokenList, token, tokenText);
            token.setEndOffset(token.getEndOffset() - endCnt);

            tokenText = text.substring(token.getStartOffset(), token.getEndOffset());
            this.splitContractionOrCannot(newTokenList, token, tokenText);

            // Checked after the contraction split: a token that is exactly "n't" is
            // consumed entirely by its contraction suffix, and the zero-length
            // remainder must not survive as a token.
            if (token.getStartOffset() == token.getEndOffset()) {
                removeTokenList.add(token);
            }

            tokenText = text.substring(token.getStartOffset(), token.getEndOffset());
            if (this.findPunctSymbolInsideToken(tokens, token, tokenText)) {
                removeTokenList.add(token);
            }
        }
        tokens.addAll(newTokenList);
        for (Token tokenToBeRemoved : removeTokenList) {
            tokens.remove(tokenToBeRemoved);
        }
    }

    /**
     * Splits a contraction suffix ({@code 'd}, {@code 'm}, {@code 's}, {@code n't},
     * {@code 're}, {@code 've}, {@code 'll}) or the "not" of "cannot" off the token.
     * The split-off piece is appended to {@code newTokenList} and the token's end
     * offset is pulled back to where the piece starts. Only the first apostrophe in
     * the token is considered.
     */
    private void splitContractionOrCannot(List<Token> newTokenList, Token token, String tokenText) {
        int aposIndex = tokenText.indexOf('\'');
        if (aposIndex == -1) {
            if (tokenText.equalsIgnoreCase("cannot")) {
                int notStart = token.getStartOffset() + 3;
                Token notToken = new Token(notStart, token.getEndOffset());
                notToken.setType(TYPE_WORD);
                newTokenList.add(notToken);
                token.setEndOffset(notStart);
            }
            return;
        }

        String afterAposStr = tokenText.substring(aposIndex + 1);
        // Index inside tokenText where the contraction suffix starts; -1 = no contraction.
        int suffixStart = -1;
        if (afterAposStr.length() == 1) {
            if (afterAposStr.equalsIgnoreCase("d") || afterAposStr.equalsIgnoreCase("m") || afterAposStr.equalsIgnoreCase("s")) {
                suffixStart = aposIndex;
            } else if (afterAposStr.equalsIgnoreCase("t") && aposIndex > 0
                    && tokenText.substring(aposIndex - 1, aposIndex).equalsIgnoreCase("n")) {
                // "n't": the suffix includes the 'n' before the apostrophe.
                suffixStart = aposIndex - 1;
            }
        } else if (afterAposStr.length() == 2
                && (afterAposStr.equalsIgnoreCase("re") || afterAposStr.equalsIgnoreCase("ve") || afterAposStr.equalsIgnoreCase("ll"))) {
            suffixStart = aposIndex;
        }

        if (suffixStart != -1) {
            Token cpToken = new Token(token.getStartOffset() + suffixStart, token.getEndOffset());
            cpToken.setType(TYPE_CONTRACTION);
            newTokenList.add(cpToken);
            token.setEndOffset(cpToken.getStartOffset());
        }
    }

    /**
     * Emits one punctuation/symbol token per leading non-alphanumeric character.
     *
     * @return number of leading characters consumed
     */
    private int processStartPunctSymbol(List<Token> newTokenList, Token token, String tokenText) {
        int count = 0;
        while (count < tokenText.length() && !this.isAlphabetLetterOrDigit(tokenText.charAt(count))) {
            newTokenList.add(this.newPunctOrSymbolToken(token.getStartOffset() + count, tokenText.charAt(count)));
            ++count;
        }
        return count;
    }

    /**
     * Emits one punctuation/symbol token per trailing non-alphanumeric character.
     *
     * @return number of trailing characters consumed
     */
    private int processEndPunctSymbol(List<Token> newTokenList, Token token, String tokenText) {
        int count = 0;
        for (int i = tokenText.length() - 1; i >= 0 && !this.isAlphabetLetterOrDigit(tokenText.charAt(i)); --i) {
            newTokenList.add(this.newPunctOrSymbolToken(token.getStartOffset() + i, tokenText.charAt(i)));
            ++count;
        }
        return count;
    }

    /**
     * Finds the first character that should split the token.
     *
     * <p>{@code ','} and {@code '.'} split only when the whole token is not a number;
     * {@code ':'} and {@code ';'} never split; any other non-alphanumeric character
     * always splits.
     *
     * @return index of the splitting character, or -1 if there is none
     */
    private int getFirstInsidePunctSymbol(String tokenText) {
        for (int i = 0; i < tokenText.length(); ++i) {
            char currentChar = tokenText.charAt(i);
            if (currentChar == ',' || currentChar == '.') {
                if (!Tokenizer.isNumber(tokenText)) {
                    return i;
                }
            } else if (currentChar != ':' && currentChar != ';' && !this.isAlphabetLetterOrDigit(currentChar)) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Splits the token at each inside punctuation/symbol character, appending the
     * pieces to {@code tokens}. A token whose first splitting character is a hyphen
     * is kept whole when the hyphen map lists it with a frequency above the cutoff.
     *
     * @return {@code true} if the final (unsplit) piece was newly added to
     *         {@code tokens}; for the token originally passed in by the caller this
     *         means it was split and the original must be removed. {@code false} if
     *         that piece was already in the list.
     */
    private boolean findPunctSymbolInsideToken(List<Token> tokens, Token token, String tokenText) {
        int punctSymbolOffset = this.getFirstInsidePunctSymbol(tokenText);
        boolean keepWhole = punctSymbolOffset == -1
                || (tokenText.charAt(punctSymbolOffset) == '-' && this.isFrequentHyphenatedWord(tokenText));
        if (keepWhole) {
            if (tokens.contains(token)) {
                return false;
            }
            tokens.add(token);
            return true;
        }

        int startOffset = token.getStartOffset();
        Token t = this.newPunctOrSymbolToken(startOffset + punctSymbolOffset, tokenText.charAt(punctSymbolOffset));
        tokens.add(t);
        if (startOffset != t.getStartOffset()) {
            tokens.add(new Token(startOffset, t.getStartOffset()));
        }
        // Continue with whatever follows the splitting character.
        Token rightToken = new Token(t.getEndOffset(), token.getEndOffset());
        String rightTokenText = tokenText.substring(punctSymbolOffset + 1);
        return this.findPunctSymbolInsideToken(tokens, rightToken, rightTokenText);
    }

    /**
     * Tells whether the hyphen map lists the lower-cased text with a frequency strictly
     * above the cutoff. A missing map, missing key or null frequency all count as "no".
     */
    private boolean isFrequentHyphenatedWord(String tokenText) {
        if (this.iv_hyphMap == null) {
            return false;
        }
        // NOTE(review): default-locale lower-casing is kept because how the map keys
        // were produced is not visible here -- confirm before switching to Locale.ROOT.
        Integer freq = this.iv_hyphMap.get(tokenText.toLowerCase());
        return freq != null && freq.intValue() > this.iv_freqCutoff;
    }

    /** Creates a one-character token at {@code offset}, typed as punctuation or symbol. */
    private Token newPunctOrSymbolToken(int offset, char c) {
        Token t = new Token(offset, offset + 1);
        t.setType(this.punctOrSymbolType(c));
        return t;
    }

    /** Returns the type code for a non-alphanumeric character: punctuation or symbol. */
    private byte punctOrSymbolType(char c) {
        return this.isPunctuation(c) ? TYPE_PUNCT : TYPE_SYMBOL;
    }

    /** Tells whether the character is one of the punctuation marks this tokenizer knows. */
    private boolean isPunctuation(char c) {
        switch (c) {
            case ';':
            case ':':
            case ',':
            case '.':
            case '(':
            case ')':
            case '[':
            case ']':
            case '{':
            case '}':
            case '<':
            case '>':
            case '\'':
            case '"':
            case '/':
            case '\\':
            case '-':
                return true;
            default:
                return false;
        }
    }

    private boolean isAlphabetLetterOrDigit(char c) {
        return this.isAlphabetLetter(c) || this.isDigit(c);
    }

    /**
     * Tells whether {@link Character#getNumericValue(char)} maps the character into
     * 10..35, which is the range that method assigns to the Latin letters A-Z / a-z.
     *
     * <p>NOTE(review): this is not a general letter test. Characters for which
     * {@code getNumericValue} returns -1 (accented letters, for instance) are rejected
     * and are therefore tokenized as symbols, while any non-letter character whose
     * numeric value happens to lie in 10..35 is accepted. Left unchanged because
     * existing tokenization output depends on it.
     *
     * @param c character to test
     * @return {@code true} if the character's numeric value is between 10 and 35 inclusive
     */
    public boolean isAlphabetLetter(char c) {
        int numericValue = Character.getNumericValue(c);
        return numericValue >= 10 && numericValue <= 35;
    }

    /** Tells whether {@link Character#getNumericValue(char)} maps the character into 0..9. */
    private boolean isDigit(char c) {
        int numericValue = Character.getNumericValue(c);
        return numericValue >= 0 && numericValue <= 9;
    }

    /**
     * Tells whether the text looks like a number: digits, at most one decimal point,
     * and commas that are each followed by a multiple of three digits (counted up to
     * the next decimal point or the end of the text). At least one digit is required,
     * so {@code ""}, {@code "."} and {@code ","} are not numbers.
     *
     * @param tokenText text to test; must not be {@code null}
     * @return {@code true} if the text matches the number pattern described above
     */
    public static boolean isNumber(String tokenText) {
        boolean foundDecimalPoint = false;
        boolean foundDigit = false;
        int digitsSinceSeparator = 0;
        // Scan right to left so that comma grouping can be checked against the digits
        // that follow each comma.
        for (int i = tokenText.length() - 1; i >= 0; --i) {
            char currentChar = tokenText.charAt(i);
            if (Character.isDigit(currentChar)) {
                foundDigit = true;
                ++digitsSinceSeparator;
            } else if (currentChar == '.' && !foundDecimalPoint) {
                foundDecimalPoint = true;
                digitsSinceSeparator = 0;
            } else if (currentChar != ',' || digitsSinceSeparator % 3 != 0) {
                return false;
            }
        }
        return foundDigit;
    }

    /** A number token is an integer when it contains no decimal point. */
    private boolean isInteger(String tokenText) {
        return tokenText.indexOf('.') == -1;
    }

    /**
     * Records the word's capitalization pattern on the token: no uppercase, all
     * uppercase, first character only, or mixed. Text without any uppercase character
     * (including empty text) counts as "no uppercase".
     */
    private void applyCapitalizationRules(Token token, String tokenText) {
        boolean sawUppercase = false;
        boolean sawNonUppercase = false;
        boolean uppercaseAfterFirst = false;
        for (int i = 0; i < tokenText.length(); ++i) {
            if (Character.isUpperCase(tokenText.charAt(i))) {
                sawUppercase = true;
                if (i > 0) {
                    uppercaseAfterFirst = true;
                }
            } else {
                sawNonUppercase = true;
            }
        }

        byte caps;
        if (!sawUppercase) {
            caps = CAPS_NONE;
        } else if (!sawNonUppercase) {
            caps = CAPS_ALL;
        } else if (Character.isUpperCase(tokenText.charAt(0)) && !uppercaseAfterFirst) {
            caps = CAPS_FIRST_ONLY;
        } else {
            caps = CAPS_MIXED;
        }
        token.setCaps(caps);
    }

    /**
     * Records where digits occur in the word: nowhere, at the first character, at the
     * last character (but not the first), or only in between.
     */
    private void applyWordNumRules(Token token, String tokenText) {
        boolean hasDigit = false;
        for (int i = 0; i < tokenText.length() && !hasDigit; ++i) {
            hasDigit = Character.isDigit(tokenText.charAt(i));
        }

        byte numPosition;
        if (!hasDigit) {
            numPosition = NUM_NONE;
        } else if (Character.isDigit(tokenText.charAt(0))) {
            numPosition = NUM_FIRST;
        } else if (Character.isDigit(tokenText.charAt(tokenText.length() - 1))) {
            numPosition = NUM_LAST;
        } else {
            numPosition = NUM_MIDDLE;
        }
        token.setNumPosition(numPosition);
    }

    /**
     * Creates one end-of-line token per {@code '\r'} or {@code '\n'} character, so a
     * CRLF sequence yields two tokens.
     */
    private List<Token> getEndOfLineTokens(String text) {
        List<Token> eolTokens = new ArrayList<Token>();
        for (int i = 0; i < text.length(); ++i) {
            char currentChar = text.charAt(i);
            if (currentChar == '\r' || currentChar == '\n') {
                Token t = new Token(i, i + 1);
                t.setType(TYPE_EOL);
                eolTokens.add(t);
            }
        }
        return eolTokens;
    }

    /**
     * Splits the text into maximal runs of non-delimiter characters. Delimiters are
     * space, tab, newline and carriage return.
     */
    private List<Token> getRawTokens(String text) {
        List<Token> rawTokens = new ArrayList<Token>();
        // Start of the run currently being scanned, or -1 while between runs.
        int startIndex = -1;
        for (int i = 0; i < text.length(); ++i) {
            if (!Tokenizer.isRawTokenDelimiter(text.charAt(i))) {
                if (startIndex < 0) {
                    startIndex = i;
                }
            } else if (startIndex >= 0) {
                rawTokens.add(new Token(startIndex, i));
                startIndex = -1;
            }
        }
        if (startIndex >= 0) {
            rawTokens.add(new Token(startIndex, text.length()));
        }
        return rawTokens;
    }

    /**
     * Tells whether the character separates raw tokens. {@code '\r'} is included so
     * that with CRLF line endings it is not glued to the preceding word and emitted a
     * second time as a symbol token on top of its end-of-line token.
     */
    private static boolean isRawTokenDelimiter(char c) {
        return c == ' ' || c == '\t' || c == '\n' || c == '\r';
    }
}

