Class: Candle::GazetteerEntityRecognizer

Inherits:
Object
  • Object
show all
Defined in:
lib/candle/ner.rb

Overview

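Gazetteer-based entity recognizer: searches text for a fixed set of terms and reports each occurrence that is delimited by word boundaries as an entity of the configured type.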

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(entity_type, terms = [], case_sensitive: false) ⇒ GazetteerEntityRecognizer

Returns a new instance of GazetteerEntityRecognizer.



230
231
232
233
234
# File 'lib/candle/ner.rb', line 230

# Builds a recognizer for a single entity type.
#
# @param entity_type [Object] value stored as-is and used as the +label+ of
#   every entity reported by #recognize
# @param terms [Array] initial terms, handed to +build_term_set+
# @param case_sensitive [Boolean] when false (the default), #recognize
#   compares lower-cased text against lower-cased terms
def initialize(entity_type, terms = [], case_sensitive: false)
  @entity_type = entity_type
  @case_sensitive = case_sensitive
  # NOTE(review): build_term_set is defined outside this listing; it presumably
  # normalizes terms using @case_sensitive, so keep it after the assignment
  # above -- confirm against the helper.
  @terms = build_term_set(terms)
end

Instance Attribute Details

#case_sensitive ⇒ Object (readonly)

Returns the value of attribute case_sensitive.



228
229
230
# File 'lib/candle/ner.rb', line 228

# Read-only accessor for the case-sensitivity flag given to the constructor.
#
# @return [Object] the stored +case_sensitive+ value (false by default)
def case_sensitive
  @case_sensitive
end

#entity_type ⇒ Object (readonly)

Returns the value of attribute entity_type.



228
229
230
# File 'lib/candle/ner.rb', line 228

# Read-only accessor for the entity type given to the constructor; this is the
# value placed in the +label+ field of every entity returned by #recognize.
#
# @return [Object] the stored entity type
def entity_type
  @entity_type
end

#terms ⇒ Object (readonly)

Returns the value of attribute terms.



228
229
230
# File 'lib/candle/ner.rb', line 228

# Read-only accessor for the gazetteer's term collection, created by
# +build_term_set+ in the constructor and extended by #add_terms.
#
# @return [Object] the term collection (presumably a Set, since it is used via
#   +add+ and +each+ -- confirm against +build_term_set+)
def terms
  @terms
end

Instance Method Details

#add_terms(terms) ⇒ Object

Add terms to the gazetteer



237
238
239
240
241
# File 'lib/candle/ner.rb', line 237

# Adds one or more terms to the gazetteer.
#
# A non-Array argument is treated as a single term. Every term is passed
# through +normalize_term+ before being stored.
#
# @param terms [Array, Object] a list of terms or a single term
# @return [self] the recognizer, so calls can be chained
def add_terms(terms)
  batch = terms.is_a?(Array) ? terms : [terms]

  batch.each do |entry|
    @terms.add(normalize_term(entry))
  end

  self
end

#load_from_file(filepath) ⇒ Object

Load terms from a file, one term per line; blank lines and lines starting with "#" are skipped.



244
245
246
247
248
249
250
# File 'lib/candle/ner.rb', line 244

# Loads gazetteer terms from a text file containing one term per line.
#
# Each line is stripped of surrounding whitespace. Lines that are then empty,
# or that begin with "#", are ignored; every other line is handed to
# #add_terms.
#
# @param filepath [String] path of the file to read
# @return [self] the recognizer, so calls can be chained
def load_from_file(filepath)
  File.readlines(filepath).each do |raw_line|
    entry = raw_line.strip
    next if entry.empty? || entry.start_with?("#")

    add_terms(entry)
  end

  self
end

#recognize(text, tokenizer = nil) ⇒ Object

Recognize entities using the gazetteer



253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
# File 'lib/candle/ner.rb', line 253

# Recognizes gazetteer entities in +text+.
#
# Every stored term is searched for as a substring (overlapping occurrences
# included). A hit is reported only when the characters immediately before and
# after it satisfy +word_boundary?+; the start and end of the text count as a
# space. Results are grouped by term in term-iteration order, not sorted by
# position.
#
# When the recognizer is case-insensitive the search runs on a lower-cased copy
# of the text. Lower-casing can lengthen a string (for example U+0130 becomes
# two code points), so offsets found in the copy are translated back to the
# original text before being reported.
#
# @param text [String, nil] text to scan; nil or empty yields no entities
# @param tokenizer [Object, nil] accepted for interface compatibility; this
#   method does not use it
# @return [Array<Hash>] one hash per hit with the keys +:text+ (the matched
#   slice of the original text), +:label+, +:start+, +:end+ (exclusive),
#   +:confidence+ (always 1.0) and +:source+ (always "gazetteer")
def recognize(text, tokenizer = nil)
  return [] if text.nil? || text.empty?

  haystack = @case_sensitive ? text : text.downcase

  # offsets[i] is the index in +text+ of the character that produced
  # haystack[i]; a final sentinel maps haystack.length to text.length.
  # It is only needed when lower-casing changed the length; a single character
  # never lower-cases to fewer characters, so equal lengths mean equal indices.
  offsets = nil
  if haystack.length != text.length
    offsets = []
    text.each_char.with_index do |char, original_idx|
      char.downcase.length.times { offsets << original_idx }
    end
    offsets << text.length
  end

  entities = []

  @terms.each do |term|
    pattern = @case_sensitive ? term : term.downcase
    # An empty pattern matches at every position and would only produce
    # zero-length entities.
    next if pattern.empty?

    pos = 0
    while (idx = haystack.index(pattern, pos))
      # Advance by one so overlapping occurrences are still found.
      pos = idx + 1
      stop = idx + pattern.length

      if offsets
        # Ignore hits that begin or end inside the multi-character expansion
        # of a single original character; they have no exact original span.
        next if idx > 0 && offsets[idx - 1] == offsets[idx]
        next if offsets[stop] == offsets[stop - 1]

        first = offsets[idx]
        last = offsets[stop]
      else
        first = idx
        last = stop
      end

      # Check word boundaries against the original text.
      prev_char = first > 0 ? text[first - 1] : " "
      next_char = last < text.length ? text[last] : " "
      next unless word_boundary?(prev_char) && word_boundary?(next_char)

      entities << {
        text: text[first...last],
        label: @entity_type,
        start: first,
        end: last,
        confidence: 1.0,
        source: "gazetteer"
      }
    end
  end

  entities
end