pyparsingラーニング
# -*- coding: utf-8 -*-
# The doctest examples in test() refer to the pyparsing names (Literal, Word,
# ParseException, ...) unqualified, so they are pulled into the module
# namespace with a wildcard import.
from pyparsing import *


def test():
    ur"""Doctest walkthrough of basic pyparsing expressions.

    The function has no body of its own; the examples below are executed by
    doctest.testmod() when this file is run as a script.  The first example
    pins pyparsing.__version__ to '1.5.2'.

    >>> import pyparsing
    >>> from pprint import pprint
    >>> def check(func, *args):
    ...     try:
    ...         print func(*args)
    ...     except ParseException:
    ...         print 'except!!'

    # Version check
    >>> pyparsing.__version__
    '1.5.2'

    # Exact string (Literal)
    >>> pattern = Literal('hoge')
    >>> pattern.parseString('hoge hoge')
    (['hoge'], {})
    >>> check(pattern.parseString, 'foo bar baz')
    except!!

    # Case-insensitive exact string (CaselessLiteral)
    >>> CaselessLiteral('HOGE').parseString('hoge hoge')
    (['HOGE'], {})

    # Exact word (Keyword); in contrast, Literal also matches when the word is only a prefix
    >>> Keyword('if').parseString('if')
    (['if'], {})

    # Case-insensitive exact word (Keyword)
    >>> Keyword('if', caseless=True).parseString('IF')
    (['if'], {})

    # String built from an arbitrary set of characters (Word)
    >>> Word(alphas).parseString('abc')
    (['abc'], {})
    >>> Word(nums).parseString('012')
    (['012'], {})
    >>> Word(alphanums).parseString('abc012')
    (['abc012'], {})

    # Inverse of Word (CharsNotIn)
    >>> CharsNotIn(nums).parseString('abc')
    (['abc'], {})
    >>> check(CharsNotIn(nums).parseString, '012')
    except!!

    # Concatenation (Combine / And(+))
    >>> Combine(Literal('abc') + Literal('012')).parseString('abc012')
    (['abc012'], {})

    # Single choice, first alternative wins (MatchFirst(|))
    >>> (Literal('abc') | Literal('abc012')).parseString('abc012')
    (['abc'], {})

    # Choose the alternative with the longest match (Or(^))
    >>> (Literal('abc') ^ Literal('abc012')).parseString('abc012')
    (['abc012'], {})

    # One or more (OneOrMore): strict order
    >>> OneOrMore(Literal('foo') + Literal('bar') + Literal('baz')).parseString('foobarbaz')
    (['foo', 'bar', 'baz'], {})

    # One or more (OneOrMore): any order
    >>> OneOrMore(Literal('foo') | Literal('bar') | Literal('baz')).parseString('barfoobaz')
    (['bar', 'foo', 'baz'], {})

    # Trying it together with Combine
    >>> Combine(OneOrMore(Literal('foo') | Literal('bar') | Literal('baz'))).parseString('barfoobaz')
    (['barfoobaz'], {})

    # Zero occurrences allowed as well (ZeroOrMore)
    >>> (Literal('foo') + ZeroOrMore(Literal('bar')) + Literal('baz')).parseString('foobarbaz')
    (['foo', 'bar', 'baz'], {})
    >>> Combine(Literal('foo') + ZeroOrMore(Literal('bar')) + Literal('baz')).parseString('foobaz')
    (['foobaz'], {})
    >>> Combine(Literal('foo') + ZeroOrMore(Literal('bar')) + Literal('baz')).parseString('foobarbaz')
    (['foobarbaz'], {})
    >>> Combine(Literal('foo') + ZeroOrMore(Literal('bar')) + Literal('baz')).parseString('foobarbarbarbaz')
    (['foobarbarbarbaz'], {})

    # Needed for parsing but left out of the parse tree (Suppress)
    >>> (Literal('foo') + Suppress('bar') + Literal('baz')).parseString('foo bar baz')
    (['foo', 'baz'], {})

    # Grouping (Group)
    >>> Group(Literal('foo') + ZeroOrMore(Literal('bar')) + Literal('baz')).parseString('foobarbarbarbaz')
    ([(['foo', 'bar', 'bar', 'bar', 'baz'], {})], {})

    # Read up to the point where the target matches (SkipTo)
    >>> (Literal('<') + SkipTo('>')).parseString('<aaabbbccc>')
    (['<', 'aaabbbccc'], {})
    >>> (Literal('<') + SkipTo('>', include=True)).parseString('<aaabbbccc>')
    (['<', (['aaabbbccc', '>'], {})], {})

    # Assert that the expression does NOT match (NotAny(~))
    >>> (~Literal('012') + Word(nums)).parseString('02468')
    (['02468'], {})
    >>> check((~Literal('012') + Word(nums)).parseString, '01234')
    except!!

    # Assert that the expression DOES match (FollowedBy)
    >>> token = FollowedBy('abc') + Word(alphas)
    >>> token.parseString('abcdef')
    (['abcdef'], {})
    >>> check(token.parseString, 'bcdef')
    except!!
    >>> (Literal('abc').suppress() + Word(alphas)).parseString('abcdef') # looks similar, but the result differs
    (['def'], {})

    # CSV data (commaSeparatedList)
    >>> commaSeparatedList.parseString('a,b,c,d,e,f')
    (['a', 'b', 'c', 'd', 'e', 'f'], {})
    >>> delimitedList(Word(alphas) | Word(alphanums)).parseString('abc, 012 , 999 , xyz, 666, 712')
    (['abc', '012', '999', 'xyz', '666', '712'], {})

    # Turn the parsed data into a dictionary (Dict)
    >>> csvdata = '''
    ... abc, 0, 1, 2, 3;
    ... def, 4, 5, 6, 7;
    ... ghi, 8, 9, A, B;'''
    >>> pattern = Word(alphas) + OneOrMore(Suppress(',') + Word(alphanums)) + Suppress(';')
    >>> pattern = Dict(OneOrMore(Group(pattern)))
    >>> pprint(pattern.parseString(csvdata).asList())
    [['abc', '0', '1', '2', '3'],
     ['def', '4', '5', '6', '7'],
     ['ghi', '8', '9', 'A', 'B']]

    # Start of string (StringStart) / end of string (StringEnd)
    >>> (StringStart() + Literal('abc') + StringEnd()).parseString('abc')
    (['abc'], {})
    >>> (StringStart() + Literal('abc')).parseString('abc')
    (['abc'], {})
    >>> check((StringStart() + Literal('abc') + StringEnd()).parseString, 'abc012')
    except!!

    # Quoted strings (quotedString / dblQuotedString)
    >>> OneOrMore(quotedString | dblQuotedString).parseString(''' "foo" \'bar\' "baz" ''')
    (['"foo"', "'bar'", '"baz"'], {})
    >>> OneOrMore(quotedString | dblQuotedString).parseString(''' "foo""bar"\'baz\' ''')
    (['"foo""bar"', "'baz'"], {})

    # Everything up to the end of the line (restOfLine)
    >>> OneOrMore(Word('# \t').suppress() + restOfLine).parseString('''
    ... # foo 1000
    ... # bar 2000
    ... # baz 3000
    ...
    ... ''')
    (['foo 1000', 'bar 2000', 'baz 3000'], {})

    # makeHTMLTags
    >>> start, end = makeHTMLTags('PRE')
    >>> (start.suppress() + SkipTo(end) + end.suppress()).parseString('<PRE>abc</PRE>')
    (['abc'], {})

    # Recursive definition (Forward)
    >>> expr = Forward()
    >>> expr << ('<' + (expr | Word(alphas)) + '>')
    >>> expr.parseString('<abc>')
    (['<', 'abc', '>'], {})
    >>> expr.parseString('<<abc>>')
    (['<', '<', 'abc', '>', '>'], {})
    >>> expr.parseString('<<<abc>>>')
    (['<', '<', '<', 'abc', '>', '>', '>'], {})

    # Any one of a space-separated list of alternatives (oneOf)
    >>> OneOrMore(oneOf('ab cd ef')).parseString('abcdef')
    (['ab', 'cd', 'ef'], {})
    """


# When run as a script, execute every doctest in this module and print the
# value returned by doctest.testmod().
if __name__ == '__main__':
    import doctest
    print doctest.testmod()
職場で使ってるファイルのparseに使おうと思ってるんだけども、ここで晒すわけにはいかないので、何か手ごろなお試しファイルは無いかな〜。