NAME
    Lingua::ZH::Jieba - Perl wrapper for CppJieba (Chinese text
    segmentation)

VERSION
    version 0.007

SYNOPSIS
        use Lingua::ZH::Jieba;

        binmode STDOUT, ":utf8";

        my $jieba = Lingua::ZH::Jieba->new();

        # default cut (切词，MP/HMM混合方法)
        my $words = $jieba->cut("他来到了网易杭研大厦");
        print join('/', @$words), "\n";
        # 他/来到/了/网易/杭研/大厦

        # cut without HMM (切词，MP方法)
        my $words_nohmm = $jieba->cut(
            "他来到了网易杭研大厦",
            { no_hmm => 1 }
        );
        print join('/', @$words_nohmm), "\n";
        # 他/来到/了/网易/杭/研/大厦

        # cut all (Full方法，切出所有词典里的词语)
        my $words_cutall = $jieba->cut(
            "我来到北京清华大学",
            { cut_all => 1 }
        );
        print join('/', @$words_cutall), "\n";
        # 我/来到/北京/清华/清华大学/华大/大学

        # cut for search (先用Mix方法切词，对于切出的较长词再用Full方法)
        my $words_cut4search = $jieba->cut_for_search(
            "小明硕士毕业于中国科学院计算所，后在日本京都大学深造"
        );
        print join('/', @$words_cut4search), "\n";
        # 小明/硕士/毕业/于/中国/科学/学院/科学院/中国科学院/计算/计算所/，/后/在/日本/京都/大学/日本京都大学/深造

        # get word offset and length with cut_ex() or cut_for_search_ex()
        my $words_ex = $jieba->cut_ex("他来到了网易杭研大厦");
        # [
        #     [ "他",   0, 1 ],
        #     [ "来到", 1, 2 ],
        #     [ "了",   3, 1 ],
        #     [ "网易", 4, 2 ],
        #     [ "杭研", 6, 2 ],
        #     [ "大厦", 8, 2 ],
        # ]

        # part-of-speech tagging (词性标注)
        my $word_pos_tags = $jieba->tag("我是蓝翔技工拖拉机学院手扶拖拉机专业的。");
        for my $pair (@$word_pos_tags) {
            my ($word, $part_of_speech) = @$pair;
            print "$word:$part_of_speech\n";
        }
        # 我:r
        # 是:v
        # 蓝翔:nz
        # 技工:n
        # 拖拉机:n
        # ...

        # keyword extraction (关键词提取)
        my $extractor = $jieba->extractor();
        my $word_scores = $extractor->extract(
            "我是拖拉机学院手扶拖拉机专业的。不用多久，我就会升职加薪，当上CEO，走上人生巅峰。",
            5
        );
        for my $pair (@$word_scores) {
            my ($word, $score) = @$pair;
            printf "%s:%.3f\n", $word, $score;
        }
        # CEO:11.739
        # 升职:10.856
        # 加薪:10.643
        # 手扶拖拉机:10.009
        # 巅峰:9.494

        # insert user word (动态增加用户词)
        my $words_before_insert = $jieba->cut("男默女泪");
        print join('/', @$words_before_insert), "\n";
        # 男默/女泪

        $jieba->insert_user_word("男默女泪");

        my $words_after_insert = $jieba->cut("男默女泪");
        print join('/', @$words_after_insert), "\n";
        # 男默女泪

DESCRIPTION
    This module is a Perl wrapper for CppJieba, a C++ implementation of the
    Jieba Chinese text segmentation library. The Perl/C++ binding is
    generated via SWIG.

    The distribution contains several packages. Unless stated otherwise,
    you only need "use Lingua::ZH::Jieba;" in your programs.

    At present this module is still in an alpha state. Its interface is
    subject to change in the future, although I will try to keep it
    backward compatible where possible.

CONSTRUCTOR
  new
        my $jieba = Lingua::ZH::Jieba->new;

    By default the constructor uses the data files from the "share"
    directory of the installation, but it is possible to override any of
    them as below.

        my $jieba = Lingua::ZH::Jieba->new(
            {
                dict_path      => $my_dict_path,
                hmm_path       => $my_hmm_path,
                user_dict_path => $my_user_dict_path,
                idf_path       => $my_idf_path,
                stop_word_path => $my_stop_word_path,
            }
        );

        # if you would like to override only the user dict
        my $jieba = Lingua::ZH::Jieba->new(
            {
                user_dict_path => $my_user_dict_path,
            }
        );

METHODS
  cut
        my $words = $jieba->cut($sentence);

    Default cut mode. Returns an arrayref of UTF-8 strings of the words cut
    from the sentence.

        my $words = $jieba->cut($sentence, { no_hmm => 1 });

    Cut without the HMM model.

        my $words = $jieba->cut($sentence, { cut_all => 1 });

    Cut out all possible words found in the dictionary.

  cut_ex
        my $words_ex = $jieba->cut_ex($sentence);

    Similar to cut(), but returns an arrayref of richer data. Each element
    of the result arrayref is of the form [ $word, $offset, $length ].
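    In the synopsis example above the offsets and lengths appear to be
    counted in characters rather than bytes, so they line up with substr()
    on a decoded (character) string. Below is a minimal sketch under that
    assumption; it simply re-slices the original sentence to show how the
    offsets can be used.

        use utf8;
        use Lingua::ZH::Jieba;

        binmode STDOUT, ":utf8";

        my $jieba    = Lingua::ZH::Jieba->new();
        my $sentence = "他来到了网易杭研大厦";

        for my $item (@{ $jieba->cut_ex($sentence) }) {
            my ($word, $offset, $length) = @$item;

            # assumes character-based offsets, as in the synopsis example
            my $slice = substr($sentence, $offset, $length);
            printf "%s (offset=%d, length=%d) => %s\n",
                $word, $offset, $length, $slice;
        }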
  cut_for_search
        my $words = $jieba->cut_for_search($sentence);
        my $words_nohmm = $jieba->cut_for_search($sentence, { no_hmm => 1 });

    Cut mode for search engines: the sentence is first cut with the default
    (Mix) method, and the longer words in the result are then cut again
    with the Full method.

  cut_for_search_ex
        my $words_ex = $jieba->cut_for_search_ex($sentence);

    Similar to cut_for_search(), but returns an arrayref of richer data.
    Each element of the result arrayref is of the form
    [ $word, $offset, $length ].

  tag
        my $word_pos_tags = $jieba->tag($sentence);
        for my $pair (@$word_pos_tags) {
            my ($word, $part_of_speech) = @$pair;
            ...
        }

    POS (part-of-speech) tagging. Returns an arrayref in which each element
    is of the form [ $word, $part_of_speech ].

  insert_user_word
        $jieba->insert_user_word($word);

    Dynamically inserts a user word.

  extractor
        my $extractor = $jieba->extractor();

    Gets the keyword extractor object. For more about the extractor, see
    Lingua::ZH::Jieba::KeywordExtractor.

SEE ALSO
    https://github.com/fxsjy/jieba - Jieba, the Chinese text segmentation
    library

    https://github.com/yanyiwu/cppjieba - CppJieba, Jieba implemented in
    C++

    http://www.swig.org - SWIG, the Simplified Wrapper and Interface
    Generator

ACKNOWLEDGEMENTS
    Thanks to Junyi Sun and Yanyi Wu. This Perl library would not exist
    without their work on Jieba and CppJieba.

AUTHOR
    Stephan Loyd <stephanloyd9@gmail.com>

COPYRIGHT AND LICENSE
    This software is copyright (c) 2017-2023 by Stephan Loyd.

    This is free software; you can redistribute it and/or modify it under
    the same terms as the Perl 5 programming language system itself.