Class: Candle::GazetteerEntityRecognizer

Inherits:
Object
  • Object
show all
Defined in:
lib/candle/ner.rb

Overview

Gazetteer-based entity recognizer

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(entity_type, terms = [], case_sensitive: false) ⇒ GazetteerEntityRecognizer

Returns a new instance of GazetteerEntityRecognizer.



207
208
209
210
211
# File 'lib/candle/ner.rb', line 207

# Build a recognizer that tags matches with +entity_type+.
#
# @param entity_type [Object] label stored in @entity_type and attached to
#   every entity this recognizer reports
# @param terms [Array] initial terms, handed to +build_term_set+
# @param case_sensitive [Boolean] stored in @case_sensitive; false by default
def initialize(entity_type, terms = [], case_sensitive: false)
  @entity_type, @case_sensitive = entity_type, case_sensitive
  # The term set is built last so that @case_sensitive is already assigned
  # when build_term_set runs.
  @terms = build_term_set(terms)
end

Instance Attribute Details

#case_sensitive ⇒ Object (readonly)

Returns the value of attribute case_sensitive.



205
206
207
# File 'lib/candle/ner.rb', line 205

# Reader for the case-sensitivity flag given to the constructor.
#
# @return [Object] the current value of @case_sensitive
def case_sensitive
  @case_sensitive
end

#entity_type ⇒ Object (readonly)

Returns the value of attribute entity_type.



205
206
207
# File 'lib/candle/ner.rb', line 205

# Reader for the entity label given to the constructor.
#
# @return [Object] the current value of @entity_type
def entity_type
  @entity_type
end

#terms ⇒ Object (readonly)

Returns the value of attribute terms.



205
206
207
# File 'lib/candle/ner.rb', line 205

# Reader for the gazetteer's term collection.
#
# The returned object is the live collection held in @terms, not a copy.
#
# @return [Object] the current value of @terms
def terms
  @terms
end

Instance Method Details

#add_terms(terms) ⇒ Object

Add terms to the gazetteer



214
215
216
217
218
# File 'lib/candle/ner.rb', line 214

# Add one term or an Array of terms to the gazetteer.
#
# Anything that is not an Array is treated as a single term. Each term is
# passed through +normalize_term+ before being added to @terms.
#
# @param terms [Array, Object] a single term or an Array of terms
# @return [self] the recognizer, so calls can be chained
def add_terms(terms)
  batch = terms.is_a?(Array) ? terms : [terms]
  batch.each do |entry|
    @terms.add(normalize_term(entry))
  end
  self
end

#load_from_file(filepath) ⇒ Object

Load terms from a file, one term per line. Blank lines and lines starting with "#" are skipped.



221
222
223
224
225
226
227
# File 'lib/candle/ner.rb', line 221

# Load terms from a file, one term per line.
#
# Each line is stripped of surrounding whitespace. Blank lines and lines
# whose first non-blank character is "#" are skipped; every other line is
# passed to #add_terms.
#
# @param filepath [String] path of the term file
# @return [self] the recognizer, so calls can be chained
# @raise [SystemCallError] if the file cannot be opened (e.g. Errno::ENOENT)
def load_from_file(filepath)
  # Stream the file line by line instead of reading every line into an
  # array first.
  File.foreach(filepath) do |line|
    term = line.strip
    next if term.empty? || term.start_with?("#")

    add_terms(term)
  end
  self
end

#recognize(text, tokenizer = nil) ⇒ Object

Recognize entities using the gazetteer



230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
# File 'lib/candle/ner.rb', line 230

# Recognize entities using the gazetteer.
#
# Scans +text+ for every occurrence of each term in @terms and keeps the
# occurrences whose neighbouring characters both satisfy +word_boundary?+.
# Matching is done on downcased copies of the text and the term unless
# @case_sensitive is set. Empty terms are ignored.
#
# @param text [String] the text to scan
# @param tokenizer [Object, nil] accepted for interface compatibility; this
#   method does not use it
# @return [Array<Hash>] one hash per match with the keys "text" (the matched
#   slice of the original text), "label" (@entity_type), "start", "end"
#   (exclusive character offset), "confidence" (always 1.0) and "source"
#   (always "gazetteer"). Matches are grouped by term in @terms iteration
#   order, and by ascending offset within one term.
def recognize(text, tokenizer = nil)
  entities = []
  # NOTE(review): offsets are located in the downcased copy but applied to
  # +text+; this assumes downcasing does not change the string's length —
  # confirm for non-ASCII input.
  haystack = @case_sensitive ? text : text.downcase
  text_length = text.length

  @terms.each do |term|
    pattern = @case_sensitive ? term : term.downcase
    # An empty pattern matches at every offset and would produce zero-length
    # entities wherever two boundary characters meet, so skip it.
    next if pattern.empty?

    span = pattern.length
    pos = 0

    while (idx = haystack.index(pattern, pos))
      stop = idx + span
      # A space stands in for the missing neighbour at either end of the text.
      prev_char = idx > 0 ? text[idx - 1] : " "
      next_char = stop < text_length ? text[stop] : " "

      if word_boundary?(prev_char) && word_boundary?(next_char)
        entities << {
          "text" => text[idx, span],
          "label" => @entity_type,
          "start" => idx,
          "end" => stop,
          "confidence" => 1.0,
          "source" => "gazetteer"
        }
      end

      # Advance by one character so overlapping occurrences are still found.
      pos = idx + 1
    end
  end

  entities
end