php 英文分句/分段落
英文分句/分段落 php环境,指出一个原本分句的时候的误区,分句不用考虑小数点,不用考虑域名,因为标准的句子是句号后面加空格的,唯一要考虑的就是Mr. Li 这种。先采用分段落的
适用于 PHP 环境。这里先指出以往分句时的一个误区:分句时不必考虑小数点,也不必考虑域名,因为规范的英文句子在句号后面一定跟着空格;唯一需要特殊处理的是 "Mr. Li" 这类带句点的称谓缩写。
之所以先分段落、再分句,是考虑到有些引文所在的行以冒号结尾,无法仅靠句末标点来切分。
<?php
/* TWWY'S ART */

/**
 * Split a text into passages (paragraphs), one per non-empty line.
 *
 * Any run of CR and/or LF characters acts as a separator, so "\n", "\r"
 * and "\r\n" line endings are all handled, and blank lines never produce
 * empty passages.
 *
 * @param string $text Raw input text.
 * @return array List of passages in their original order; an empty array
 *               when the input is empty or the split fails.
 */
function break_passage($text)
{
    $passages = preg_split('/[\r\n]+/', $text, -1, PREG_SPLIT_NO_EMPTY);

    // preg_split() returns false on a PCRE error; callers expect an array.
    return is_array($passages) ? $passages : array();
}

/**
 * Split a single passage into sentences.
 *
 * A sentence boundary is whitespace that directly follows '.', '!' or '?'
 * (optionally followed by a closing quote). Because the whitespace is
 * mandatory, decimal numbers ("3.14") and domain names ("example.com")
 * are never split. Whitespace following a title abbreviation such as
 * "Mr." or "Dr." is not treated as a boundary. Matching is
 * case-insensitive.
 *
 * @param string $text One passage of English text.
 * @return array List of sentences in their original order; an empty array
 *               when the input is empty or the split fails.
 */
function break_sentence($text)
{
    // Titles whose trailing period does not end a sentence.
    $titles = array('Mr', 'Mrs', 'Ms', 'Jr', 'Dr', 'Prof', 'Sr');

    $skip = array();
    foreach ($titles as $title) {
        // \b pins the title to the start of a word. Without it, "Ms." also
        // matches the tail of ordinary words such as "problems." or
        // "items.", and those sentences would never be split.
        $skip[] = '\b' . $title . '\.';
    }

    $re = '/'
        . '(?<=[.!?]|[.!?][\'"])'          // preceded by end punctuation, optionally plus a quote
        . '(?<!' . implode('|', $skip) . ')' // but not by a title abbreviation
        . '\s+'                              // the whitespace between two sentences
        . '/i';

    $sentences = preg_split($re, $text, -1, PREG_SPLIT_NO_EMPTY);

    // preg_split() returns false on a PCRE error; callers expect an array.
    return is_array($sentences) ? $sentences : array();
}

/**
 * Split a text into sentences, passage by passage (recommended entry point).
 *
 * The text is first cut into passages on line breaks, and each passage is
 * then cut into sentences, so a line break always ends a sentence even
 * when the line does not end in '.', '!' or '?' (for example a line that
 * ends with a colon).
 *
 * @param string $text Raw input text.
 * @return array Flat list of sentences in their original order.
 */
function get_sentence($text)
{
    $sentences = array();

    foreach (break_passage($text) as $passage) {
        // Append directly instead of calling array_merge() on every pass,
        // which would re-copy the whole accumulated list each time.
        foreach (break_sentence($passage) as $sentence) {
            $sentences[] = $sentence;
        }
    }

    return $sentences;
}

// Snippet source: http://outofmemory.cn
?>
收藏文章
精彩图集
精彩文章