#!/usr/bin/env python3
# -*- coding:utf-8 -*-
__author__ = 'Mu Yang <http://muyang.pro>'
__copyright__ = '2018-2020 CKIP Lab'
__license__ = 'CC BY-NC-SA 4.0'
import datetime as _datetime
import os as _os
import sys as _sys
import tempfile as _tempfile
import warnings as _warnings
import ckip_classic as _about
def create_ws_lex(*lex_list):
    """Generate CKIP word segmentation lexicon file.

    Each entry is written to a temporary file as one line, with its fields
    joined by a tab character.

    Parameters
    ----------
        *lex_list : Tuple[str, str]
            the lexicon word and its POS-tag.

    Returns
    -------
        lex_file : str
            the name of the lexicon file.
        f_lex : TextIO
            the file object.

    Raises
    ------
        TypeError
            if an entry contains a non-string field. The temporary file is
            closed before the error propagates.

    .. attention::
        Remember to close **f_lex** manually. The file is a
        :class:`tempfile.NamedTemporaryFile`, so closing it also deletes it;
        keep **f_lex** open for as long as **lex_file** is needed.
    """
    # Opened in text mode without an explicit encoding, i.e. with the
    # platform default encoding.
    f_lex = _tempfile.NamedTemporaryFile(mode='w')
    try:
        for lex in lex_list:
            print('\t'.join(lex), file=f_lex)
        # Make the content visible to readers that open the file by name.
        f_lex.flush()
    except Exception:
        # Do not leave an open temporary file behind on malformed input.
        f_lex.close()
        raise

    return f_lex.name, f_lex
def create_ws_ini(*,  # pylint: disable=too-many-statements
        data2_dir=None,
        lex_file=None,
        new_style_format=False,
        show_category=True,
        sentence_max_word_num=80,
        **options,
    ):
    """Generate CKIP word segmentation config.

    Parameters
    ----------
        data2_dir : str
            the path to the folder "Data2/". When omitted, the environment
            variable ``CKIPWS_DATA2`` is used, then
            ``<sys.prefix>/share/ckip_classic/Data2``. If the resulting path
            is not a directory, a warning is issued and the relative path
            ``Data2`` is written instead.
        lex_file : str
            the path to the user-defined lexicon file. When empty, every
            reference to the text lexicon is left out of the config.
        new_style_format : bool
            split sentences by newline characters ("\\\\n") rather than punctuations.
        show_category : bool
            show part-of-speech tags.
        sentence_max_word_num : int
            maximum number of words per sentence.
        **options
            any other keyword arguments; not used here, returned unchanged.

    Returns
    -------
        ini_file : str
            the name of the config file.
        f_ini : TextIO
            the file object.
        options : dict
            the keyword arguments that were not consumed by this function.

    .. attention::
        Remember to close **f_ini** manually. The file is a
        :class:`tempfile.NamedTemporaryFile`, so closing it also deletes it.
    """
    # pylint: disable=invalid-name

    if data2_dir is None:
        data2_dir = _os.getenv('CKIPWS_DATA2')
    if not data2_dir:
        data2_dir = _os.path.join(_sys.prefix, 'share', 'ckip_classic', 'Data2')
    if not _os.path.isdir(data2_dir):
        _warnings.warn('Invalid data2_dir (%s)' % data2_dir)
        data2_dir = 'Data2'

    # Extra key emitted by the sections that reference the user lexicon.
    text_lex = ['TextLexiconName=TextLex'] if lex_file else []

    # 'myLWGen' is only defined when a user lexicon is given, so it is
    # dropped from the handler list otherwise.
    handler_list = [
        handler for handler in (
            'LWGen', 'myLWGen', 'DMMergedParser', 'RD3', 'RD6', 'RD7', 'FW', 'BW', 'MaxMatch', 'ProbModel', 'UW', 'DDMFPP',
            'LWGen', 'UWGen', 'RWTBPP', 'LWGen', 'myLWGen', 'DMSplittedParser', 'BW', 'MaxMatch', 'ProbModel', 'CA',
        )
        if lex_file or handler != 'myLWGen'
    ]

    lines = []

    def section(*entries):
        """Append one group of config lines followed by a blank separator."""
        lines.extend(entries)
        lines.append('')

    section(
        f';PyCkip {_about.__version__}',
        ';ws.ini',
        f';Auto-generated {_datetime.datetime.now()}',
    )
    section(
        '[ConsoleLogger]',
        'Name=ConsoleLogger',
    )
    if lex_file:
        section(
            '[CTextLexicon]',
            'Name=TextLex',
            f'FileName={lex_file}',
        )
    section(
        '[CLexicon]',
        'Name=Lex',
        f'FileName={data2_dir}/Lexicon.Dat',
    )
    section(
        '[CALexicon]',
        'Name=CALex',
        f'FileName={data2_dir}/CALexicon.Dat',
    )
    section(
        '[CDMMergedParser]',
        'Name=DMMergedParser',
        'GenerateMaxLengthWordOnly=no',
        'LexiconName=Lex',
    )
    section(
        '[CDMSplittedParser]',
        'Name=DMSplittedParser',
        'GenerateMaxLengthWordOnly=no',
        'LexiconName=Lex',
    )
    section(
        '[CHTRDRule3]',
        'Name=RD3',
        'LexiconName=Lex',
    )
    section(
        '[CHTRDRule6]',
        'Name=RD6',
        'LexiconName=Lex',
    )
    section(
        '[CHTRDRule7]',
        'Name=RD7',
        'LexiconName=Lex',
    )
    section(
        '[CHTForeignWord]',
        'Name=FW',
        'LexiconName=Lex',
    )
    section(
        '[CHTBoundWord]',
        'Name=BW',
        'LexiconName=Lex',
    )
    section(
        '[CMaxMatch]',
        'Name=MaxMatch',
        'WindowSize=3',
    )
    section(
        '[CHTCategoryPredictor]',
        'Name=CatPred',
        f'PrefixCategoryFileName={data2_dir}/CatPredictData/PrefixCategoryFreq',
        f'PrefixFileName={data2_dir}/CatPredictData/PrefixFreq',
        f'SuffixCategoryFileName={data2_dir}/CatPredictData/SuffixCategoryFreq',
        f'SuffixFileName={data2_dir}/CatPredictData/SuffixFreq',
    )
    section(
        '[CStatProb]',
        'Name=CAProb1',
        f'FileName={data2_dir}/CAStat-w(0)c(0)-w(-1).dat',
    )
    section(
        '[CStatProb]',
        'Name=CAProb2',
        f'FileName={data2_dir}/CAStat-w(0)c(0)-w(1).dat',
    )
    section(
        '[CStatProb]',
        'Name=CAProb3',
        f'FileName={data2_dir}/CAStat-w(0)c(0)-w(-2).dat',
    )
    section(
        '[CStatProb]',
        'Name=CAProb4',
        f'FileName={data2_dir}/CAStat-w(0)c(0)-w(2).dat',
    )
    section(
        '[CoveringAmbiguity]',
        'Name=CA',
        'LexiconName=Lex',
        'CoveringAmbiguityLexiconName=CALex',
        'InsertSplittedWordsOnly=false',
        'StatisticProbability1=CAProb1',
        'StatisticProbability2=CAProb2',
        'StatisticProbability3=CAProb3',
        'StatisticProbability4=CAProb4',
    )
    section(
        '[CStatProb]',
        'Name=Prob1',
        f'FileName={data2_dir}/CKIPWStatistic-w(-1)-w(0).dat',
    )
    section(
        '[CStatProb]',
        'Name=Prob2',
        f'FileName={data2_dir}/CKIPWStatistic-c(-1)-c(0).dat',
    )
    section(
        '[CStatProb]',
        'Name=Prob3',
        f'FileName={data2_dir}/CKIPWStatistic-c(0)-w(0).dat',
    )
    section(
        '[CSimpleProbModel]',
        'Name=ProbModel',
        'StatisticProbability1=Prob1',
        'StatisticProbability2=Prob2',
        'StatisticProbability3=Prob3',
        'LexiconName=Lex',
        *text_lex,
        'AdjustProb3=true',
        'CoveringAmbiguityLexiconName=CALex',
        'CategoryPredictor=CatPred',
        'KeepBestCategory=true',
        'SimplifiedCategory=false',
    )
    section(
        '[CDetectMonosyllabicMorpheme]',
        'Name=DMM',
        'ApplyDefaultHeuristicDetectRule=yes',
        f'InitDataPath={data2_dir}/uwea/qrulepool/',
    )
    section(
        '[ChineseName]',
        'Name=CN',
        'LexiconName=Lex',
        f'InitDataPath={data2_dir}/uwea/data/',
    )
    section(
        '[CForeignName]',
        'Name=FN',
        'LexiconName=Lex',
        *text_lex,
        f'InitDataPath={data2_dir}/uwea/data/',
    )
    section(
        '[CompoundWord]',
        'Name=CW',
        'LexiconName=Lex',
        f'InitDataPath={data2_dir}/uwea/data/',
    )
    section(
        '[CStatisticWord]',
        'Name=SW',
        'LexiconName=Lex',
        *text_lex,
        'CategoryPredictor=CatPred',
        f'InitDataPath={data2_dir}/uwea/data/',
        'ApplyRule=639',
    )
    section(
        '[CAffixCombiner]',
        'Name=AC',
        'LexiconName=Lex',
        *text_lex,
        'CategoryPredictor=CatPred',
    )
    section(
        '[CSimilarStructureCombiner]',
        'Name=SSC',
        'AutoCombineWordLen=2',
        'HeuristicCombinedWordMaxLen=3',
        'LexiconName=Lex',
        'CategoryPredictor=CatPred',
    )
    section(
        '[COnlineLexicon]',
        'Name=OnlineLexForUWGen',
    )
    section(
        '[CUnknownWord]',
        'Name=UW',
        'UnknownWordGeneratorList=DMM CN FN CW SW SSC',
        'OnlineLexicon=OnlineLexForUWGen',
    )
    if lex_file:
        section(
            '[CLexWordGenerator]',
            'Name=myLWGen',
            'LexiconName=TextLex',
        )
    section(
        '[CLexWordGenerator]',
        'Name=LWGen',
        'LexiconName=Lex',
    )
    section(
        '[CLexWordGenerator]',
        'Name=LWGen1',
        'LexiconName=Lex',
        'MaxWordLen=1',
    )
    section(
        '[CLexWordGenerator]',
        'Name=UWGen',
        'LexiconName=OnlineLexForUWGen',
    )
    section(
        '[CSimpleProbModelResult]',
        'Name=ProbModelResult',
        'ProbabilityModelName=ProbModel',
        f'NewStyleFormat={str(new_style_format).lower()}',
        f'ShowCategory={str(show_category).lower()}',
        'LexiconName=Lex',
        'CategoryPredictor=CatPred',
        'KeepExistingWord=true',
        'FeatureAssigner=FA',
        'FilterBadWord=false',
    )
    section(
        '[CDetectDMForPostProcess]',
        'Name=DDMFPP',
    )
    section(
        '[CRemoveWordToBePostProcessed]',
        'Name=RWTBPP',
    )
    section(
        '[CWordSegmentor]',
        'Name=MainWS',
        'ArticleMaxLineNum=300',
        f'SentenceMaxWordNum={sentence_max_word_num}',
        'ReloadMyDic=false',
        'SentenceDelimiter=,,;。!?',
        f'HandlerList={" ".join(handler_list)}',
        'Result=ProbModelResult',
    )

    # The file is opened only once the whole text is ready, so a failure
    # while building the config cannot leave an open temporary file behind.
    f_ini = _tempfile.NamedTemporaryFile(mode='w')
    try:
        print('\n'.join(lines), file=f_ini)
        # Make the content visible to readers that open the file by name.
        f_ini.flush()
    except Exception:
        f_ini.close()
        raise

    return f_ini.name, f_ini, options
def _resolve_parser_dir(path, *, env_var, dir_name, arg_name):
    """Resolve a parser data directory, falling back step by step.

    Order: the explicit *path*, the environment variable *env_var*, then
    ``<sys.prefix>/share/ckip_classic/<dir_name>``. If the result is not an
    existing directory, a warning is issued and the relative *dir_name* is
    returned instead.
    """
    if path is None:
        path = _os.getenv(env_var)
    if not path:
        path = _os.path.join(_sys.prefix, 'share', 'ckip_classic', dir_name)
    if not _os.path.isdir(path):
        # stacklevel=2 attributes the warning to the caller of this helper.
        _warnings.warn('Invalid %s (%s)' % (arg_name, path), stacklevel=2)
        path = dir_name
    return path


def create_parser_ini(*,  # pylint: disable=too-many-statements
        ws_ini_file,
        rule_dir=None,
        rdb_dir=None,
        do_ws=True,
        do_parse=True,
        do_role=True,
        sentence_delim=',,;。!?',
        **options,
    ):
    """Generate CKIP parser config.

    Parameters
    ----------
        ws_ini_file : str
            the path to the word segmentation config file; written as the
            ``ini`` entry of the ``[WordSeg]`` section.
        rule_dir : str
            the path to "Rule/". When omitted, the environment variable
            ``CKIPPARSER_RULE`` is used, then
            ``<sys.prefix>/share/ckip_classic/Rule``.
        rdb_dir : str
            the path to "RDB/". When omitted, the environment variable
            ``CKIPPARSER_RDB`` is used, then
            ``<sys.prefix>/share/ckip_classic/RDB``.
        do_ws : bool
            do word segmentation.
        do_parse : bool
            do parsing.
        do_role : bool
            do role.
        sentence_delim : str
            the sentence delimiters.
        **options
            any other keyword arguments; not used here, returned unchanged.

    Returns
    -------
        ini_file : str
            the name of the config file.
        f_ini : TextIO
            the file object.
        options : dict
            the keyword arguments that were not consumed by this function.

    Raises
    ------
        ValueError
            if **do_parse** is false and the remaining tasks are anything
            other than "role only" (**do_ws** false, **do_role** true).

    .. attention::
        Remember to close **f_ini** manually. The file is a
        :class:`tempfile.NamedTemporaryFile`, so closing it also deletes it.
    """
    # pylint: disable=invalid-name

    # Validate the task selection before touching the file system, so an
    # invalid combination cannot leave an open temporary file behind.
    assign_role_only = False
    if not do_parse:
        if do_ws:
            if do_role:
                raise ValueError('Invalid tasks')
            raise ValueError('Use ckipws.CkipWs for word segmentation')
        if not do_role:
            raise ValueError('Must select at least one task')
        assign_role_only = True

    # IsTag is the negation of do_ws.
    is_tag = not do_ws
    assign_role = do_role

    rule_dir = _resolve_parser_dir(rule_dir, env_var='CKIPPARSER_RULE', dir_name='Rule', arg_name='rule_dir')
    rdb_dir = _resolve_parser_dir(rdb_dir, env_var='CKIPPARSER_RDB', dir_name='RDB', arg_name='rdb_dir')

    lines = [
        f';PyCkip {_about.__version__}',
        ';parser.ini',
        f';Auto-generated {_datetime.datetime.now()}',
        '',
        '[WordSeg]',
        f'ini={ws_ini_file}',
        '',
        '[Parser]',
        'SetPos13=0',
        f'13CateFile={rule_dir}/13Cate.txt',
        '',
        'SetMap=1',
        f'CatMapFile={rule_dir}/CatMap.txt',
        f'WordLib1={rule_dir}/WordLib1.txt',
        f'WordLib2={rule_dir}/WordLib2.txt',
        f'WordLib3={rule_dir}/WordLib3.txt',
        '',
        f'GrammarRule={rule_dir}/CKIP-Rule.txt',
        f'HeadRule={rule_dir}/CKIP-Head.txt',
        '',
        'SetChangePos=1',
        f'SentenceDelimiter={sentence_delim}',
        'SetLength=15',
        'NormalPos=1',
        'NormalTree=1',
        f'IsTag={int(is_tag)}',
        '',
        '[SRL]',
        f'DataPath={rdb_dir}/',
        f'AssignRole={int(assign_role)}',
        f'AssignRoleOnly={int(assign_role_only)}',
    ]

    f_ini = _tempfile.NamedTemporaryFile(mode='w')
    try:
        print('\n'.join(lines), file=f_ini)
        # Make the content visible to readers that open the file by name.
        f_ini.flush()
    except Exception:
        f_ini.close()
        raise

    return f_ini.name, f_ini, options