| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110 |
- var _ = require("underscore");
- _.str = require('underscore.string');
- var supported_languages = ["danish","dutch","english","french","galician","german","italian","polish","portuguese","romanian","russian","spanish","swedish"];
- var stopwords = require("./stopwords/stopwords");
/**
 * Tells whether a whitespace-delimited token contains an http(s) URL.
 * Such tokens are kept verbatim (no punctuation stripping, no lower-casing).
 */
function _isUrl(word){
  return /https?:\/\/.*[\r\n]*/.test(word);
}

/**
 * Normalises a single token: strips punctuation (unless it is a URL), drops
 * lone symbol characters and, when remove_digits is truthy, all-digit tokens.
 * Returns "" when nothing usable is left.
 */
function _cleanWord(word, remove_digits){
  // remove periods, question marks, exclamation points, commas, semi-colons,
  // colons, parentheses and quotes (straight single quotes only at the edges)
  var cleaned = _isUrl(word) ? word : word.replace(/\.|,|;|!|\?|\(|\)|:|"|^'|'$|“|”|‘|’/g,'');
  // if this is a short result, make sure it's not a single character or something 'odd'
  if(cleaned.length === 1){
    cleaned = cleaned.replace(/-|_|@|&|#/g,'');
  }
  // if it's a number (digits only), remove it
  if(remove_digits && /^\d+$/.test(cleaned)){
    return "";
  }
  return cleaned;
}

/**
 * Extracts keywords from a text by stripping HTML tags, cleaning each
 * whitespace-separated word and discarding stopwords.
 *
 * @param {string} str - Text to analyse. An empty value yields [].
 * @param {Object} [options] - When empty or missing, defaults to
 *   { remove_digits: true, return_changed_case: true }. When any option is
 *   supplied, the unspecified ones are simply falsy.
 * @param {boolean} [options.remove_digits] - Drop words made only of digits.
 * @param {boolean} [options.return_changed_case] - Return keywords lower-cased
 *   (URLs are always returned unchanged).
 * @param {boolean} [options.return_chained_words] - Join every run of
 *   consecutive keywords into a single space-separated entry.
 * @param {number} [options.return_max_ngrams] - Join consecutive keywords into
 *   an entry of at most this many words. Ignored when return_chained_words is set.
 * @param {string} [options.language="english"] - Stopword language.
 * @param {boolean} [options.remove_duplicates=false] - De-duplicate the result.
 * @param {Array} [options.stopwords] - Custom stopword list; overrides the
 *   list looked up for options.language.
 * @returns {string[]} Keywords in order of appearance.
 * @throws {Error} When the language is not in supported_languages.
 */
function _extract(str, options){
  if(_.isEmpty(str)){
    return [];
  }
  // Defaults only kick in when no options at all were given.
  var opts = _.isEmpty(options) ? { remove_digits: true, return_changed_case: true } : options;
  var return_changed_case = opts.return_changed_case;
  var return_chained_words = opts.return_chained_words;
  var remove_digits = opts.remove_digits;
  var _language = opts.language || "english";
  var _remove_duplicates = opts.remove_duplicates || false;
  var return_max_ngrams = opts.return_max_ngrams;

  if(supported_languages.indexOf(_language) < 0){
    throw new Error("Language must be one of ["+supported_languages.join(",")+"]");
  }

  // strip any HTML and trim whitespace
  var text = _.str.trim(_.str.stripTags(str));
  if(_.isEmpty(text)){
    return [];
  }

  // Clean every word, keeping a lower-cased copy (for stopword lookup)
  // alongside the original-case copy at the same index.
  var words = text.split(/\s/);
  var unchanged_words = [];
  var low_words = [];
  for(var x = 0; x < words.length; x++){
    var w = _cleanWord(words[x], remove_digits);
    if(w.length > 0){
      low_words.push(w.toLowerCase());
      unchanged_words.push(w);
    }
  }

  var results = [];
  var _stopwords = opts.stopwords || _getStopwords({ language: _language });
  // Index of the first word of the current run of consecutive keywords.
  var chain_start_index = 0;
  // Whether the word just before the current one was a keyword. An explicit
  // flag (rather than a "last keyword index" initialised to 0) keeps a leading
  // stopword from being mistaken for the start of a keyword run, which used to
  // cost one slot of return_max_ngrams.
  var previous_was_keyword = false;

  for(var y = 0; y < low_words.length; y++){
    if(_stopwords.indexOf(low_words[y]) >= 0){
      previous_was_keyword = false;
      continue;
    }
    if(!previous_was_keyword){
      chain_start_index = y;
    }
    var result_word = return_changed_case && !_isUrl(unchanged_words[y]) ? low_words[y] : unchanged_words[y];

    // Append to the previous entry when chaining is requested, or when the
    // current run still has room under return_max_ngrams. Note: once a run
    // reaches the n-gram limit, its remaining words are emitted one by one
    // because the run start is only reset by a stopword.
    var append_to_previous = previous_was_keyword && results.length > 0 && Boolean(
      return_chained_words ||
      (return_max_ngrams && return_max_ngrams > (y - chain_start_index))
    );

    if(append_to_previous){
      results[results.length - 1] = results[results.length - 1] + ' ' + result_word;
    } else {
      results.push(result_word);
    }
    previous_was_keyword = true;
  }

  if(_remove_duplicates){
    results = _.uniq(results);
  }
  return results;
}
/**
 * Looks up the stopword list registered for a language.
 *
 * @param {Object} [options]
 * @param {string} [options.language="english"] - Must be one of supported_languages.
 * @returns {*} The entry stored under that language in the stopwords module.
 * @throws {Error} When the language is not in supported_languages.
 */
function _getStopwords(options){
  var settings = options || {};
  var requested_language = settings.language || "english";
  var is_supported = supported_languages.indexOf(requested_language) >= 0;

  if(!is_supported){
    throw new Error("Language must be one of [" + supported_languages.join(",") + "]");
  }

  return stopwords[requested_language];
}
// Public API: keyword extraction plus direct access to the stopword lists.
module.exports = { extract: _extract, getStopwords: _getStopwords };
|