/*
 * Decompiled with CFR 0.152.
 */
package com.github.pmerienne.trident.ml.preprocessing;

import com.github.pmerienne.trident.ml.preprocessing.EnglishTokenizer;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringEscapeUtils;

public class TwitterTokenizer
extends EnglishTokenizer {
    private static final long serialVersionUID = -2486285775626564821L;
    private static final String URL_REGEX = "((www\\.[\\s]+)|(https?://[^\\s]+))";
    private static final String CONSECUTIVE_CHARS = "([a-z])\\1{1,}";
    private static final String STARTS_WITH_NUMBER = "[1-9]\\s*(\\w+)";

    public TwitterTokenizer() {
    }

    public TwitterTokenizer(int minNGram, int maxNGram) {
        super(minNGram, maxNGram);
    }

    @Override
    public List<String> tokenize(String text) {
        text = this.preprocess(text);
        return super.tokenize(text);
    }

    protected String preprocess(String tweet) {
        tweet = tweet.replaceAll(URL_REGEX, "");
        tweet = tweet.replaceAll("@([^\\s]+)", "");
        tweet = tweet.replaceAll(CONSECUTIVE_CHARS, "$1");
        tweet = tweet.replaceAll(STARTS_WITH_NUMBER, "");
        tweet = tweet.replaceAll("&amp;", "&");
        tweet = StringEscapeUtils.unescapeHtml((String)tweet);
        return tweet;
    }
}

