// text-methods.js

/**
 * Text methods
 * @namespace TextMethods
 */

/**
 * Breaks a chunk of text into the tokens we consider spell checkable.
 *
 * The text is first split on whitespace; whitespace-delimited chunks that look like URLs
 * (http://, https://, ftp://, or a literal "www.") or contain an "@" are discarded whole. The
 * remaining chunks are split on punctuation/symbol separators, surrounding single quotes are
 * stripped, and empty strings, (optionally) single characters, and anything that converts to a
 * number are dropped.
 *
 * @memberof TextMethods
 * @param  {string} content - the chunk of text we will be operating on
 * @param  {boolean} [filterChars=true] - whether or not we want to filter out single characters
 * @returns {string[]} An array of cleaned text (tokens) we consider to be spell checkable; an
 * empty array (plus a console warning) if content is falsy or not a string
 */
function cleanText (content, filterChars = true) {
  if (!content || typeof content !== 'string') {
    console.warn(`Multidict: cannot clean falsy, undefined, or non-string content: "${content}"`)
    return []
  }

  // the dot after "www" must be escaped: an unescaped dot matches any character, which made
  // ordinary words such as "awwww" look like URLs and silently removed them from the result
  const rxUrls = /(http:\/\/|https:\/\/|ftp:\/\/|www\.)/
  // TODO: at some point look into using /[^\p{L}'-]/u which uses unicode ranges to match anything
  // that is not a valid letter, hyphen, or apostrophe
  const rxSeparators = /[.,:;!?¿_<>{}()[\]"`´^$°§½¼³%&¬+=*~#|/\\]/
  const rxSingleQuotes = /^'+|'+$/g
  // empty strings are never wanted; single characters only when filterChars is off
  const minLength = filterChars ? 2 : 1

  // split all content by whitespace (spaces, tabs, new lines), our safest delimiters
  return content.split(/\s/)
    // filter out any URLS and emails
    .filter(chunk => !rxUrls.test(chunk) && !chunk.includes('@'))
    // split all content by any character that does not form part of a word
    .flatMap(chunk => chunk.split(rxSeparators))
    // remove any number of single quotes that do not form part of a word i.e. 'y'all' > y'all
    .map(token => token.replace(rxSingleQuotes, ''))
    // drop tokens that are too short or that convert to a number (i.e. '100', '1e3')
    .filter(token => token.length >= minLength && Number.isNaN(Number(token)))
}

/**
 * Convenience wrapper around cleanText: cleans the content without filtering out single
 * characters and hands back only the first token of the result.
 *
 * @memberof TextMethods
 * @see cleanText
 * @param  {string} content - the chunk of text to operate i.e. '100 bouncy balls'
 * @returns {string|undefined} The first valid word i.e. 'bouncy', or undefined when cleanText
 * yields no tokens
 */
function cleanWord (content) {
  const [firstWord] = cleanText(content, false)
  return firstWord
}

/**
 * Picks the mark used to insert word suggestions. Only the children of highlights whose text
 * content equals the misspelt word are considered, and index selects among those matches (so it
 * is relative to how many times the misspelt word appears inside the highlighter).
 *
 * @memberof TextMethods
 * @param  {string} misspeltWord - the misspelt word the mark is positioned over
 * @param  {number} index - the relative index of the misspelt word if it appears more than once
 * @param  {Node} highlights - the node containing the highlights (child nodes) we will be searching
 * @returns {Node|undefined} The matching mark at the given relative index, or undefined if there
 * is no such match
 */
function getCurrentMark (misspeltWord, index, highlights) {
  const matchingMarks = [...highlights.children]
    .filter(mark => mark.textContent === misspeltWord)

  return matchingMarks[index]
}

/**
 * Works out the current word and its boundaries. When the node has a non-collapsed selection the
 * selected text is used as the word; otherwise the word is derived from the caret position via
 * getWordBoundsFromCaret.
 *
 * @memberof TextMethods
 * @param  {Node} node - the node to operate on
 * @returns {Array} An array containing the word, start index, and end index i.e. ['boom', 0, 3]
 */
function getCurrentWordBounds (node) {
  const content = getTextContent(node)
  const { start, end } = getSelectionBounds(node)

  // equal start and end means the selection is collapsed (just a caret), so build the word
  // from the caret position instead
  if (start === end) {
    return getWordBoundsFromCaret(node, content, start)
  }

  const selectedText = content.slice(start, end)
  return [selectedText, ...getRelativeBounds(selectedText, content, start)]
}

/**
 * Get the relative index of a Mark by matching the Word boundaries against the chunk of text being
 * spellchecked. This is needed for when duplicate misspelt words appear inside of a textarea.
 *
 * Every occurrence of the Word's text inside content is visited from left to right. The
 * occurrence whose start and end equal the Word's own start and end is the one we want; the
 * result is the number of whole-word occurrences that were seen before it.
 *
 * @memberof TextMethods
 * @param  {string} content - the chunk of text we will operate on
 * @param  {Word} word - the Word we are searching for
 * @returns {number} The relative index of the word among its whole-word occurrences in content,
 * or -1 (plus a console warning) if the word is invalid, content is falsy, or no occurrence lines
 * up with the Word's boundaries
 */
function getMatchingMarkIndex (content, word) {
  if (!word.isValid() || !content) {
    console.warn('Multidict: cannot get mark index of undefined content or invalid word')
    return -1
  }

  let wholeWordsSeen = 0
  let position = content.indexOf(word.text, 0)

  while (position !== -1) {
    if (position === word.start && position + word.length === word.end) {
      return wholeWordsSeen
    }

    // occurrences embedded inside a longer word do not count as a preceding match
    if (isWholeWord(word.text, content, position)) {
      wholeWordsSeen++
    }

    // resume the search right after the occurrence we just inspected
    position = content.indexOf(word.text, position + word.length)
  }

  console.warn('Multidict: could not find matching mark index inside given content!')
  return -1
}

/**
 * Locate the first occurrence of word inside content at or after startIndex and report where it
 * begins and ends.
 *
 * @memberof TextMethods
 * @param  {string} word - the word we are searching for within content
 * @param  {string} content - the chunk of text being operated on
 * @param  {number} [startIndex=0] - the index from where to begin searching content from
 * @returns {Array|undefined} An array containing the start and (exclusive) end bounds of the
 * word, or undefined (plus a console warning) if word or content is falsy or the word cannot be
 * found
 */
function getRelativeBounds (word, content, startIndex = 0) {
  // only search when both inputs are usable; -1 doubles as the "nothing to report" marker
  const start = word && content ? content.indexOf(word, startIndex) : -1

  if (start === -1) {
    console.warn(`Multidict: cannot get relative boundaries of ${word} in ${content}`)
    return
  }

  return [start, start + word.length]
}

/**
 * Read the selection boundaries of a node. Only textareas are supported for now; any other node
 * produces a console warning and an empty object.
 *
 * @memberof TextMethods
 * @param  {Node} node - the node we will operate on
 * @returns {Object} An object with start and end properties (taken from the node's
 * selectionStart and selectionEnd) or an empty object if node is not supported
 */
function getSelectionBounds (node) {
  const isTextarea = node.nodeName === 'TEXTAREA'

  if (isTextarea) {
    const { selectionStart: start, selectionEnd: end } = node
    return { start, end }
  }

  console.warn('Multidict: can only get selection bounds of textareas')
  return {}
}

/**
 * Return the text of a node, choosing the source property by node type: textareas expose their
 * text through value, every other node through innerText (falling back to textContent when
 * innerText is falsy).
 *
 * @memberof TextMethods
 * @param  {Node} node - the node we will operate on
 * @returns {string} The text content (including line breaks) of the node
 */
function getTextContent (node) {
  if (node.nodeName === 'TEXTAREA') {
    return node.value
  }

  return node.innerText || node.textContent
}

/**
 * Gets the currently focused word and its boundaries based on caret position.
 *
 * Starting at startIndex, the boundaries are widened to the left and to the right until a word
 * boundary character (or either end of the text) is reached. The text in between is then passed
 * through cleanWord, and the bounds of the cleaned word are looked up with getRelativeBounds.
 *
 * If the text under the caret is not spell checkable (cleanWord yields nothing, i.e. for a pure
 * number or an e-mail address) an empty word is returned together with the raw boundaries,
 * instead of throwing while spreading an undefined bounds result.
 *
 * @memberof TextMethods
 * @param  {Node} node - the node we will operate on (currently not used by this function)
 * @param  {string} text - the text content used to generate a word selection from
 * @param  {number} startIndex - the index from where we will begin building our word
 * @returns {Array} An array containing the word, start index, and end index. The word is an empty
 * string when there is no spell checkable word at the caret; [''] alone is returned (plus a
 * console warning) when startIndex is not a non-negative number
 */
function getWordBoundsFromCaret (node, text, startIndex) {
  // written as a negated comparison so that undefined and NaN are rejected as well
  if (!(startIndex >= 0)) {
    console.warn('Multidict: cannot get current word boundaries if start index negative')
    return ['']
  }

  // for an empty text box and unhandled cases we return no text, start and end 0
  if (!text) {
    return ['', 0, 0]
  }

  let start = startIndex
  let end = startIndex

  // charAt returns '' when out of range, which counts as a word boundary, so both loops always
  // stop at the edges of the text
  while (!_isWordBoundary(text.charAt(start - 1))) {
    start--
  }
  while (!_isWordBoundary(text.charAt(end))) {
    end++
  }

  // start and end can be equal if caret positioned between 2 word boundaries i.e. ' | '
  if (start === end) {
    return ['', start, end]
  }

  const word = cleanWord(text.slice(start, end))
  const bounds = word ? getRelativeBounds(word, text, start) : undefined

  // nothing spell checkable under the caret: report an empty word over the raw boundaries
  if (!bounds) {
    return ['', start, end]
  }

  return [word, ...bounds]
}

/**
 * Tell whether the occurrence of word that begins at start stands on its own inside content,
 * i.e. the character right before it and the character right after it are both word boundaries
 * (the beginning and the end of content count as boundaries too). Only that one position is
 * inspected; content is not searched.
 *
 * @memberof TextMethods
 * @param  {string} word - the sequence of characters we are testing appears as a whole word
 * @param  {string} content - text content the word appears in
 * @param  {number} start - mandatory index that represents exactly where word begins inside content
 * @returns {boolean} True if the characters surrounding the word at start are both word boundaries
 */
function isWholeWord (word, content, start) {
  const charBefore = content.charAt(start - 1)
  const charAfter = content.charAt(start + word.length)

  if (!_isWordBoundary(charBefore)) {
    return false
  }

  return _isWordBoundary(charAfter)
}

/**
 * Decide whether a character marks the edge of a word. A missing character (undefined or a zero
 * length string) always counts as a boundary; otherwise the character is a boundary when it is
 * whitespace or one of the punctuation/symbol separators listed in the pattern below. Note that
 * apostrophes and hyphens are not part of the pattern.
 *
 * @private
 * @memberof TextMethods
 * @param  {string} char - a one character length string
 * @returns {boolean} True if the char is undefined, a 0 length string, or is a word boundary
 */
function _isWordBoundary (char) {
  const isMissing = char === undefined || char === ''

  return isMissing || /[\s\r\n.,:;!¿?_<>{}()[\]"`´^$°§½¼³%&¬+=*~#|/\\]/.test(char)
}

/**
 * Swap a Word inside of content for a replacement string. The Word's start and end boundaries
 * decide which slice of content is cut out; everything before start and from end onwards is kept.
 *
 * @memberof TextMethods
 * @param  {string} content - the content on which to operate on
 * @param  {Word} word - the Word we are going to replace
 * @param  {string} replacement - the string of text used to replace the Word
 * @returns {string} The modified content that has the replacement in place of the Word
 * @throws {TypeError} If replacement is not a string or is an empty string
 */
function replaceInText (content, word, replacement) {
  const isUsableReplacement = typeof replacement === 'string' && replacement.length > 0

  if (!isUsableReplacement) {
    throw new TypeError('replacement must be of type string and have a value')
  }

  const before = content.slice(0, word.start)
  const after = content.slice(word.end)

  return `${before}${replacement}${after}`
}

/**
 * Remember the node's selection/caret position as it is right now, so that it can be put back
 * later on.
 *
 * @memberof TextMethods
 * @param  {Node} node - a node that has start and end selection values
 * @returns {function(): undefined} The function that will be use to later restore the selection
 */
function storeSelection (node) {
  // the bounds are captured immediately; later changes to the node's selection do not affect them
  const { start, end } = getSelectionBounds(node)

  /**
   * Focuses the node and re-applies the selection range captured when storeSelection was called
   *
   * @memberof TextMethods
   */
  return () => {
    node.focus()
    node.setSelectionRange(start, end)
  }
}

// Public API of the TextMethods namespace. _isWordBoundary is not part of this list: it is
// marked @private and is only called by other functions in this file.
module.exports = {
  cleanText,
  cleanWord,
  getCurrentMark,
  getCurrentWordBounds,
  getMatchingMarkIndex,
  getRelativeBounds,
  getTextContent,
  getSelectionBounds,
  getWordBoundsFromCaret,
  isWholeWord,
  replaceInText,
  storeSelection
}