// keyword_extractor.js (4.2 KB)

  // Utility library; underscore.string is attached as `_.str` for the
  // trim/stripTags helpers used during extraction.
  var _ = require("underscore");
  _.str = require('underscore.string');
  // Language names accepted by the `language` option; anything else is rejected with an Error.
  var supported_languages = ["danish","dutch","english","french","galician","german","italian","polish","portuguese","romanian","russian","spanish","swedish"];
  // Stopword data, looked up by language name (see _getStopwords).
  var stopwords = require("./stopwords/stopwords");
  /**
   * Reports whether a whitespace-delimited token contains an http(s) URL.
   * URL tokens are never stripped of punctuation and never lower-cased.
   */
  function _isUrl(word){
    return /https?:\/\//.test(word);
  }

  /**
   * Normalises one raw token: strips sentence punctuation and quotes (unless the
   * token is a URL), discards a lone symbol character, and optionally discards
   * tokens made up entirely of digits. Returns "" when nothing usable remains.
   */
  function _cleanWord(word, remove_digits){
    // remove periods, question marks, exclamation points, commas, colons,
    // semi-colons, parentheses and quotes -- but leave URLs untouched
    var cleaned = _isUrl(word) ? word : word.replace(/\.|,|;|!|\?|\(|\)|:|"|^'|'$|“|”|‘|’/g,'');
    // a single leftover character must not be a bare symbol
    if(cleaned.length === 1){
      cleaned = cleaned.replace(/-|_|@|&|#/g,'');
    }
    // if it's purely a number, remove it
    if(remove_digits && /^\d+$/.test(cleaned)){
      return "";
    }
    return cleaned;
  }

  /**
   * Extracts the non-stopword terms from a piece of text.
   *
   * @param {string} str - Text to analyse; markup is removed with `_.str.stripTags`.
   * @param {Object} [options] - When empty or omitted, defaults to
   *   `{ remove_digits: true, return_changed_case: true }`. When supplied, any
   *   flag that is not set is simply treated as falsy.
   * @param {string} [options.language="english"] - Must be one of `supported_languages`.
   * @param {boolean} [options.remove_digits] - Drop tokens consisting only of digits.
   * @param {boolean} [options.return_changed_case] - Return lower-cased terms (URLs keep their case).
   * @param {boolean} [options.return_chained_words] - Join adjacent non-stopwords into one space-separated result.
   * @param {number} [options.return_max_ngrams] - When chaining is off, join the first N adjacent
   *   non-stopwords of a run into one result; further words of the same run are returned individually.
   * @param {boolean} [options.remove_duplicates=false] - De-duplicate the results with `_.uniq`.
   * @param {string[]} [options.stopwords] - Custom stopword list; defaults to the list for `language`.
   * @returns {string[]} Extracted terms in order of appearance.
   * @throws {Error} If `language` is not supported.
   */
  function _extract(str, options){
    if(_.isEmpty(str)){
      return [];
    }
    if(_.isEmpty(options)){
      options = {
        remove_digits: true,
        return_changed_case: true
      };
    }
    var return_changed_case = options.return_changed_case;
    var return_chained_words = options.return_chained_words;
    var remove_digits = options.remove_digits;
    var _language = options.language || "english";
    var _remove_duplicates = options.remove_duplicates || false;
    var return_max_ngrams = options.return_max_ngrams;

    if(supported_languages.indexOf(_language) < 0){
      throw new Error("Language must be one of ["+supported_languages.join(",")+"]");
    }

    // strip any HTML and trim whitespace
    var text = _.str.trim(_.str.stripTags(str));
    if(_.isEmpty(text)){
      return [];
    }

    // Two parallel arrays: the cleaned tokens as written, and their
    // lower-cased form used for stopword matching.
    var raw_words = text.split(/\s/);
    var unchanged_words = [];
    var low_words = [];
    for(var x = 0; x < raw_words.length; x++){
      var w = _cleanWord(raw_words[x], remove_digits);
      if(w.length > 0){
        low_words.push(w.toLowerCase());
        unchanged_words.push(w);
      }
    }

    var results = [];
    var _stopwords = options.stopwords || _getStopwords({ language: _language });
    // Index of the most recent non-stopword. Starts as null (not 0) so that a
    // word at index 1 is not mistaken for the continuation of a stopword at index 0.
    var _last_result_word_index = null;
    // Index at which the current unbroken run of non-stopwords began.
    var _start_result_word_index = 0;

    for(var y = 0; y < low_words.length; y++){
      if(_stopwords.indexOf(low_words[y]) >= 0){
        continue;
      }
      var continues_chain = _last_result_word_index === y - 1;
      if(!continues_chain){
        _start_result_word_index = y;
      }
      var result_word = return_changed_case && !_isUrl(unchanged_words[y]) ? low_words[y] : unchanged_words[y];
      var within_ngram = return_max_ngrams && return_max_ngrams > (y - _start_result_word_index);

      if(continues_chain && (return_chained_words || within_ngram)){
        // the previous word was kept too, so results is guaranteed non-empty here
        results[results.length - 1] += ' ' + result_word;
      } else {
        results.push(result_word);
      }
      _last_result_word_index = y;
    }

    if(_remove_duplicates){
      results = _.uniq(results);
    }
    return results;
  }
  /**
   * Looks up the stopword data for a language.
   *
   * @param {Object} [options]
   * @param {string} [options.language="english"] - Must be one of `supported_languages`.
   * @returns {*} The entry stored under that language in the stopwords module
   *   (presumably an array of lower-case words -- confirm against ./stopwords/stopwords).
   *   The stored value itself is returned, not a copy.
   * @throws {Error} If `language` is not supported.
   */
  function _getStopwords(options){
    options = options || {};
    var _language = options.language || "english";
    if(supported_languages.indexOf(_language) < 0){
      throw new Error("Language must be one of ["+supported_languages.join(",")+"]");
    }
    return stopwords[_language];
  }
  94. module.exports = {
  95. extract:_extract,
  96. getStopwords: _getStopwords
  97. };