VietnameseAnalyzer.rb

By Trieu February 14, 2011



require 'unicode'



# Token filter that normalizes token text to lower case.
#
# Wraps another token stream and passes every token it produces through
# Unicode.downcase (provided by the 'unicode' library required by this file).
class UnicodeLowerCaseFilter
  # token_stream - the upstream token stream this filter reads from. It must
  #                respond to #next and #text=.
  def initialize(token_stream)
    @input = token_stream
  end

  # Hands new source text to the wrapped stream.
  def text=(text)
    @input.text = text
  end

  # Pulls the next token from the wrapped stream and lower-cases its text.
  #
  # The token is modified in place and the same object is returned.
  # Returns nil once the wrapped stream has no more tokens.
  def next
    token = @input.next
    return nil if token.nil?

    token.text = Unicode.downcase(token.text)
    token
  end
end



# Ferret analyzer for Vietnamese text.
#
# Tokens are produced by StandardTokenizer, lower-cased by
# UnicodeLowerCaseFilter and then passed through a MappingFilter that
# replaces accented Vietnamese letters with their plain ASCII base letter.
class VietnameseAnalyzer < Ferret::Analysis::Analyzer
  include Ferret::Analysis

  # Standard character mappings to remove all special characters
  # so only default ASCII characters get indexed.
  #
  # Each key is the list of accented lower-case variants of one base letter;
  # the value is the ASCII letter they are mapped to. Only lower-case forms
  # are listed, so text has to be lower-cased before this table is applied.
  CHARACTER_MAPPINGS = {
    ['á', 'à', 'ạ', 'ả', 'ã', 'ă', 'ắ', 'ằ', 'ặ', 'ẳ', 'ẵ', 'â', 'ấ', 'ầ', 'ậ', 'ẩ', 'ẫ'] => 'a',
    ['đ'] => 'd',
    ['é', 'è', 'ẹ', 'ẻ', 'ẽ', 'ê', 'ế', 'ề', 'ệ', 'ể', 'ễ'] => 'e',
    ['í', 'ì', 'ị', 'ỉ', 'ĩ'] => 'i',
    # 'ỏ' (o with hook above) belongs here; this list previously held 'ủ',
    # which folded that u-variant to 'o' and left 'ỏ' unmapped.
    ['ó', 'ò', 'ọ', 'ỏ', 'õ', 'ơ', 'ớ', 'ờ', 'ợ', 'ở', 'ỡ', 'ô', 'ố', 'ồ', 'ộ', 'ổ', 'ỗ'] => 'o',
    # 'ủ' (u with hook above) replaces the non-Vietnamese 'ů' (u with ring).
    ['ú', 'ù', 'ụ', 'ủ', 'ũ', 'ư', 'ứ', 'ừ', 'ự', 'ử', 'ữ'] => 'u',
    ['ý', 'ỳ', 'ỵ', 'ỷ', 'ỹ'] => 'y',
  } unless defined?(CHARACTER_MAPPINGS)

  # Builds the token stream used to analyze +str+.
  #
  # field - name of the field being analyzed; not used by this analyzer.
  # str   - the text to tokenize.
  #
  # Returns the MappingFilter at the end of the filter chain.
  def token_stream(field, str)
    tokens = StandardTokenizer.new(str)
    # Lower-case first: CHARACTER_MAPPINGS lists only lower-case letters.
    lowered = UnicodeLowerCaseFilter.new(tokens)
    MappingFilter.new(lowered, CHARACTER_MAPPINGS)
  end
end

How to implement a search engine for the Vietnamese language.
Cool, I found a solution!

DATAISM ONE

VietnameseAnalyzer.rb

Comments

Popular posts from this blog

Cá nhân hóa trải nghiệm du lịch với Behavior2Vector và CDP

Handling Zalo Webhooks with AWS Lambda, SQS, and S3: A Scalable Serverless Architecture

Vì sao chúng ta cần ứng dụng Dataism cho đời sống