- Issue created by @colinstillwell
When the HTML filter strips tags, words across adjacent block elements can concatenate.
Example:
foo.</p><p>Bar becomes foo.Bar.
This causes valid queries like Bar to return no results, reducing the accuracy of search results in Typesense indexes.
Steps to reproduce: index content containing <p>foo.</p><p>Bar</p>, then search for Bar.
Actual result: No results, as the text becomes foo.Bar.
Expected result: Bar should be matched.
Two possible approaches:
Option A: Preprocessor: Preserve word boundaries
Option B: Replacement: HTML filter (Typesense)
In our project we solved this with option A:
namespace Drupal\search_api_typesense\Plugin\search_api\processor;
use Drupal\search_api\Processor\FieldsProcessorPluginBase;
/**
* Preserves word boundaries when HTML is removed.
*
* Inserts spacing at HTML element boundaries so that when tags are stripped by
* the HTML filter, words do not concatenate across block or void elements.
*
* Pair with "Search API Trim Whitespace" if you want extra spaces collapsed.
*
* @SearchApiProcessor(
* id = "search_api_typesense_preserve_word_boundaries",
* label = @Translation("Preserve word boundaries"),
* description = @Translation("Insert spacing at HTML boundaries before tags are removed to prevent word concatenation. Place before the HTML filter on text fields."),
* stages = {
* "pre_index_save" = -50,
* "preprocess_index" = -50,
* "preprocess_query" = -50
* }
* )
*/
class SearchApiTypesensePreserveWordBoundaries extends FieldsProcessorPluginBase {

  /**
   * {@inheritdoc}
   *
   * Inserts a space at the boundaries of selected HTML elements so that text
   * on either side of a tag stays separated once the tags are stripped later.
   * Only values of type "text" or "string" are touched; anything else is left
   * unchanged.
   */
  protected function processFieldValue(&$value, $type) {
    if (!$this->getDataTypeHelper()->isTextType($type, ['text', 'string'])) {
      return;
    }
    // Nothing to do for empty or non-string values; skipping them also avoids
    // preg_replace() silently casting other scalars to string.
    if (!is_string($value) || $value === '') {
      return;
    }

    $rules = self::boundaryRules();
    // The array form applies every pattern in order, each one operating on
    // the result of the previous replacement.
    $processed = preg_replace(array_keys($rules), array_values($rules), $value);

    // preg_replace() returns NULL on failure (for example malformed UTF-8
    // when the "u" modifier is used). Keep the original value in that case
    // instead of replacing the field content with NULL.
    if ($processed !== NULL) {
      $value = $processed;
    }
  }

  /**
   * Builds the ordered regex rules used to insert boundary spacing.
   *
   * The rules are built once per request and cached in a static variable.
   *
   * @return string[]
   *   Replacement strings keyed by their PCRE pattern, in application order.
   */
  private static function boundaryRules(): array {
    static $rules = NULL;
    if ($rules !== NULL) {
      return $rules;
    }

    // These would become configurable in a full implementation.
    $container_elements = [
      'blockquote', 'dd', 'dl', 'dt',
      'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
      'li', 'ol', 'p', 'ul',
    ];
    $void_elements = ['br', 'hr'];

    // Quote each tag name so the lists stay safe if they become user input.
    $alternation = static function (array $tags): string {
      return implode('|', array_map(
        static fn($tag) => preg_quote($tag, '/'),
        $tags
      ));
    };
    $containers = $alternation($container_elements);
    $voids = $alternation($void_elements);

    $rules = [
      // Space before a container open tag that directly follows
      // non-whitespace.
      '/(?<=\S)(<(?:' . $containers . ')\b[^>]*>)/iu' => ' $1',
      // Space after a container close tag that directly precedes
      // non-whitespace. The (?=\S) lookahead does not match at the end of the
      // value, so no trailing space is appended; \s* tolerates "</p >".
      '/(<\/(?:' . $containers . ')\s*>)(?=\S)/iu' => '$1 ',
      // Space before and after void elements adjacent to text.
      '/(?<=\S)<(?:' . $voids . ')\b[^>]*>/iu' => ' $0',
      '/<(?:' . $voids . ')\b[^>]*>(?=\S)/iu' => '$0 ',
    ];

    return $rules;
  }

}
Active
1.0
Code