Class: Candle::PatternEntityRecognizer

Inherits:
Object
  • Object
show all
Defined in:
lib/candle/ner.rb

Overview

Pattern-based entity recognizer for custom entities

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(entity_type, patterns = []) ⇒ PatternEntityRecognizer

Returns a new instance of PatternEntityRecognizer.



180
181
182
183
# File 'lib/candle/ner.rb', line 180

# Build a recognizer that labels every pattern match with +entity_type+.
#
# The pattern list is copied so that later calls to #add_pattern never
# mutate (or fail on) an array owned by the caller, e.g. a frozen constant.
#
# @param entity_type [Object] label attached to each recognized entity
# @param patterns [Array<String, Regexp>, String, Regexp, nil] initial
#   patterns; a single pattern or nil is coerced to an array
def initialize(entity_type, patterns = [])
  @entity_type = entity_type
  # Array() coerces nil/scalars; dup detaches us from the caller's array.
  @patterns = Array(patterns).dup
end

Instance Attribute Details

#entity_type ⇒ Object (readonly)

Returns the value of attribute entity_type.



178
179
180
# File 'lib/candle/ner.rb', line 178

# Reader for the entity type supplied to the constructor.
#
# @return [Object] the value stored in @entity_type; #recognize uses it as
#   the :label of every entity it emits
def entity_type
  @entity_type
end

#patterns ⇒ Object (readonly)

Returns the value of attribute patterns.



178
179
180
# File 'lib/candle/ner.rb', line 178

# Reader for the pattern list held by this recognizer.
#
# @return [Object] the value stored in @patterns; #add_pattern appends to it
#   and #recognize iterates over it
def patterns
  @patterns
end

Instance Method Details

#add_pattern(pattern) ⇒ Object

Add a pattern (String or Regexp)



186
187
188
189
# File 'lib/candle/ner.rb', line 186

def add_pattern(pattern)
  @patterns << pattern
  self
end

#recognize(text, tokenizer = nil) ⇒ Object

Recognize entities using patterns



192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
# File 'lib/candle/ner.rb', line 192

# Recognize entities by scanning +text+ with every registered pattern.
#
# String patterns are compiled with Regexp.new; Regexp patterns are used
# as-is. Zero-width matches (e.g. from /\d*/) are skipped because they
# would yield entities with empty text and start == end.
#
# @param text [String, nil] text to scan; nil or empty yields no entities
# @param tokenizer [Object, nil] accepted but not used by this method
# @return [Array<Hash>] one hash per non-empty match with keys :text,
#   :label, :start, :end (character offsets, end exclusive), :confidence
#   (always 1.0) and :source (always "pattern"); results are grouped by
#   pattern in registration order
def recognize(text, tokenizer = nil)
  return [] if text.nil? || text.empty?

  # Limit text length to prevent ReDoS on very long strings.
  # This is especially important for Ruby < 3.2.
  max_length = 1_000_000 # characters (String#length), not bytes
  if text.length > max_length
    warn "PatternEntityRecognizer: Text truncated from #{text.length} to #{max_length} chars for safety"
    text = text[0...max_length]
  end

  @patterns.each_with_object([]) do |pattern, entities|
    regex = pattern.is_a?(Regexp) ? pattern : Regexp.new(pattern)

    text.scan(regex) do
      # scan yields only the matched text/groups; the MatchData carries offsets.
      match = Regexp.last_match
      match_start, match_end = match.offset(0)
      next if match_start == match_end # ignore zero-width matches

      entities << {
        text: match[0],
        label: @entity_type,
        start: match_start,
        end: match_end,
        confidence: 1.0,
        source: "pattern"
      }
    end
  end
end