pyparsingラーニング

# -*- coding: utf-8 -*-

from pyparsing import *

def test():
    ur"""
    Doctest walkthrough of basic pyparsing building blocks.

    The function has no body of its own; the examples below are executed by
    doctest.testmod() when this file is run as a script. Expected outputs
    are pinned to pyparsing 1.5.2 (see the version check) and the examples
    use Python 2 print statements.

    >>> import pyparsing
    >>> from pprint import pprint
    >>> def check(func, *args):
    ...     try:
    ...         print func(*args)
    ...     except ParseException:
    ...         print 'except!!'
    
    # Version check
    >>> pyparsing.__version__
    '1.5.2'
    
    # Exact string (Literal)
    >>> pattern = Literal('hoge')
    >>> pattern.parseString('hoge hoge')
    (['hoge'], {})
    >>> check(pattern.parseString, 'foo bar baz')
    except!!
    
    # Case-insensitive exact string (CaselessLiteral)
    >>> CaselessLiteral('HOGE').parseString('hoge hoge')
    (['HOGE'], {})

    # Whole word (Keyword) -> Literal, by contrast, also matches a mere prefix of a longer word
    >>> Keyword('if').parseString('if')
    (['if'], {})
    
    # Case-insensitive whole word (Keyword)
    >>> Keyword('if', caseless=True).parseString('IF')
    (['if'], {})

    # String made up of characters from a given set (Word)
    >>> Word(alphas).parseString('abc')
    (['abc'], {})
    >>> Word(nums).parseString('012')
    (['012'], {})
    >>> Word(alphanums).parseString('abc012')
    (['abc012'], {})

    # Inverse of Word (CharsNotIn)
    >>> CharsNotIn(nums).parseString('abc')
    (['abc'], {})
    >>> check(CharsNotIn(nums).parseString, '012')
    except!!
    
    # Concatenation (Combine / And(+))
    >>> Combine(Literal('abc') + Literal('012')).parseString('abc012')
    (['abc012'], {})

    # Alternation where the first matching alternative wins (MatchFirst(|))
    >>> (Literal('abc') | Literal('abc012')).parseString('abc012')
    (['abc'], {})

    # Alternation where the longest match wins (Or(^))
    >>> (Literal('abc') ^ Literal('abc012')).parseString('abc012')
    (['abc012'], {})

    # One or more (OneOrMore): fixed order
    >>> OneOrMore(Literal('foo') + Literal('bar') + Literal('baz')).parseString('foobarbaz')
    (['foo', 'bar', 'baz'], {})
    
    # One or more (OneOrMore): any order
    >>> OneOrMore(Literal('foo') | Literal('bar') | Literal('baz')).parseString('barfoobaz')
    (['bar', 'foo', 'baz'], {})
    
    # Try it together with Combine
    >>> Combine(OneOrMore(Literal('foo') | Literal('bar') | Literal('baz'))).parseString('barfoobaz')
    (['barfoobaz'], {})

    # Zero occurrences allowed too (ZeroOrMore)
    >>> (Literal('foo') + ZeroOrMore(Literal('bar')) + Literal('baz')).parseString('foobarbaz')
    (['foo', 'bar', 'baz'], {})
    >>> Combine(Literal('foo') + ZeroOrMore(Literal('bar')) + Literal('baz')).parseString('foobaz')
    (['foobaz'], {})
    >>> Combine(Literal('foo') + ZeroOrMore(Literal('bar')) + Literal('baz')).parseString('foobarbaz')
    (['foobarbaz'], {})
    >>> Combine(Literal('foo') + ZeroOrMore(Literal('bar')) + Literal('baz')).parseString('foobarbarbarbaz')
    (['foobarbarbarbaz'], {})

    # Required while parsing but left out of the parse tree (Suppress)
    >>> (Literal('foo') + Suppress('bar') + Literal('baz')).parseString('foo bar baz')
    (['foo', 'baz'], {})

    # Grouping (Group)
    >>> Group(Literal('foo') + ZeroOrMore(Literal('bar')) + Literal('baz')).parseString('foobarbarbarbaz')
    ([(['foo', 'bar', 'bar', 'bar', 'baz'], {})], {})

    # Read up to the point where the target matches (SkipTo)
    >>> (Literal('<') + SkipTo('>')).parseString('<aaabbbccc>')
    (['<', 'aaabbbccc'], {})
    >>> (Literal('<') + SkipTo('>', include=True)).parseString('<aaabbbccc>')
    (['<', (['aaabbbccc', '>'], {})], {})

    # Assert that something does NOT match (NotAny(~))
    >>> (~Literal('012') + Word(nums)).parseString('02468')
    (['02468'], {})
    >>> check((~Literal('012') + Word(nums)).parseString, '01234')
    except!!

    # Assert that something does match (FollowedBy)
    >>> token = FollowedBy('abc') + Word(alphas)
    >>> token.parseString('abcdef')
    (['abcdef'], {})
    >>> check(token.parseString, 'bcdef')
    except!!
    >>> (Literal('abc').suppress() + Word(alphas)).parseString('abcdef') # looks similar to the FollowedBy version, but the result differs
    (['def'], {})
    
    # CSV data (commaSeparatedList)
    >>> commaSeparatedList.parseString('a,b,c,d,e,f')
    (['a', 'b', 'c', 'd', 'e', 'f'], {})
    >>> delimitedList(Word(alphas) | Word(alphanums)).parseString('abc, 012  , 999  , xyz, 666, 712')
    (['abc', '012', '999', 'xyz', '666', '712'], {})

    # Turn the parsed data into a dictionary (Dict)
    >>> csvdata = '''
    ... abc, 0, 1, 2, 3;
    ... def, 4, 5, 6, 7;
    ... ghi, 8, 9, A, B;'''
    >>> pattern = Word(alphas) + OneOrMore(Suppress(',') + Word(alphanums)) + Suppress(';')
    >>> pattern = Dict(OneOrMore(Group(pattern)))
    >>> pprint(pattern.parseString(csvdata).asList())
    [['abc', '0', '1', '2', '3'],
     ['def', '4', '5', '6', '7'],
     ['ghi', '8', '9', 'A', 'B']]

    # Start of string (StringStart) / end of string (StringEnd)
    >>> (StringStart() + Literal('abc') + StringEnd()).parseString('abc')
    (['abc'], {})
    >>> (StringStart() + Literal('abc')).parseString('abc')
    (['abc'], {})
    >>> check((StringStart() + Literal('abc') + StringEnd()).parseString, 'abc012')
    except!!

    # Quoted strings (quotedString / dblQuotedString)
    >>> OneOrMore(quotedString | dblQuotedString).parseString(''' "foo" \'bar\' "baz" ''')
    (['"foo"', "'bar'", '"baz"'], {})
    >>> OneOrMore(quotedString | dblQuotedString).parseString(''' "foo""bar"\'baz\' ''')
    (['"foo""bar"', "'baz'"], {})

    # Up to the end of the line (restOfLine)
    >>> OneOrMore(Word('# \t').suppress() + restOfLine).parseString('''
    ... # foo 1000
    ... # bar 2000
    ... # baz 3000
    ... ''')
    (['foo 1000', 'bar 2000', 'baz 3000'], {})

    # makeHTMLTags
    >>> start, end = makeHTMLTags('PRE')
    >>> (start.suppress() + SkipTo(end) + end.suppress()).parseString('<PRE>abc</PRE>')
    (['abc'], {})

    # Recursive grammar (Forward)
    >>> expr = Forward()
    >>> expr << ('<' + (expr | Word(alphas)) + '>')
    >>> expr.parseString('<abc>')
    (['<', 'abc', '>'], {})
    >>> expr.parseString('<<abc>>')
    (['<', '<', 'abc', '>', '>'], {})
    >>> expr.parseString('<<<abc>>>')
    (['<', '<', '<', 'abc', '>', '>', '>'], {})

    # Choice among space-separated literals (oneOf)
    >>> OneOrMore(oneOf('ab cd ef')).parseString('abcdef')
    (['ab', 'cd', 'ef'], {})

    """

if __name__ == '__main__':
    # Script entry point: run every doctest found in this module and
    # print the summary object that doctest.testmod() returns.
    import doctest
    summary = doctest.testmod()
    print(summary)

職場で使ってるファイルのparseに使おうと思ってるんだけども、ここで晒すわけにはいかないので、何か手ごろなお試しファイルは無いかな〜。