PyPyは本当に速いのか?

先日のjson速度計測を使いまわして試してみた。
#PyPyではujsonのコンパイルが通らず、インストールできませんでした。
PyPyで速くなったのはyamlだけ。C言語で書かれた部分を含むモジュールだと、PyPyでは逆に遅くなるということだろうか?
PyPy専用に書けば速くなるのかもしれないけど、すべてのモジュールを書き直すのも無駄なのでPyPyを使うとしたら標準モジュールでやりきれるとこまでかな。実装しなおすならRPythonかせめてCythonで書く方が有益な気がします。
前回の計測と動作速度が大きく違うのは、非力なノートPCで動かしたせいです。

サンプルデータ出力

import os
import json
import gzip
import time
import yaml
import msgpack
import bson

# Build the benchmark payload: {absolute directory path: {file name: metadata}}
# for every non-hidden directory below the scanned root.
dic = {}
for rootdir, dirs, files in os.walk('/home/shive'):
    # Prune hidden directories in place so os.walk() does not descend into them.
    dirs[:] = [s for s in dirs if not s.startswith('.')]
    rootdir = os.path.abspath(rootdir)
    node = {}
    for name in files:
        path = os.path.join(rootdir, name)
        try:
            # A single stat() call supplies the size and both timestamps.
            st = os.stat(path)
        except OSError:
            # Dangling symlinks and files removed while walking cannot be
            # stat'ed; skip them instead of aborting the whole scan.
            continue
        node[name] = {
            'name': name,
            'size': int(st.st_size),
            'mtime': time.ctime(st.st_mtime),
            'ctime': time.ctime(st.st_ctime),
            }
    dic[rootdir] = node

# Write the same dictionary once per serialization format, gzip-compressed,
# into the current directory.
with gzip.open('data.json.gz', 'wb') as fp:
    json.dump(dic, fp, ensure_ascii=False)

with gzip.open('data.yaml.gz', 'wb') as fp:
    yaml.safe_dump(dic, fp)

with gzip.open('data.mpac.gz', 'wb') as fp:
    msgpack.dump(dic, fp)

with gzip.open('data.bson.gz', 'wb') as fp:
    fp.write(bson.BSON.encode(dic))

計測コードは前回とほぼ同じ

import sys
import gzip
import timeit
import json
import yaml
import bson
import pymongo
import msgpack
try:
    import ujson
except ImportError:
    ujson = None

#=======================================================================================================================
def json_loads(dump):
    """Decode the JSON document ``dump`` with the standard ``json`` module."""
    decoded = json.loads(dump)
    return decoded

#=======================================================================================================================
def ujson_loads(dump):
    """Decode the JSON document ``dump`` with ``ujson``.

    ``ujson`` is ``None`` when its import failed, so callers check it first.
    """
    decoded = ujson.loads(dump)
    return decoded

#=======================================================================================================================
def yaml_loads(dump):
    """Parse the YAML document ``dump`` with PyYAML's pure-Python loader.

    ``Loader`` is passed explicitly because newer PyYAML releases warn about,
    and later reject, ``yaml.load()`` calls without it. ``yaml.Loader`` is the
    loader older releases used by default, so the measurement is unchanged.

    NOTE(review): ``yaml.Loader`` can construct arbitrary Python objects and
    must not be used on untrusted input; here the input is expected to be the
    locally generated sample file.
    """
    return yaml.load(dump, Loader=yaml.Loader)

#=======================================================================================================================
def cyaml_loads(dump):
    """Parse the YAML document ``dump`` using PyYAML's ``CLoader``."""
    loader_cls = yaml.CLoader
    return yaml.load(dump, Loader=loader_cls)

#=======================================================================================================================
def bson_loads(dump):
    """Wrap the raw bytes ``dump`` in ``bson.BSON`` and decode them."""
    document = bson.BSON(dump)
    return document.decode()

#=======================================================================================================================
def msgpack_loads(dump):
    """Unpack the MessagePack payload ``dump`` with ``msgpack.loads``."""
    unpacked = msgpack.loads(dump)
    return unpacked

#=======================================================================================================================
def test_load():
    """Measure and print the average time of one load call per format.

    Reads the gzip-compressed sample files ``data.json.gz``, ``data.yaml.gz``,
    ``data.bson.gz`` and ``data.mpac.gz`` from the current directory and times
    each decoder with ``timeit``. Decoders that are not available (libyaml
    bindings, ujson) are reported as ``-`` and their input is not read.
    """

    def read_gz(path):
        # Close the file as soon as it is read instead of leaving the open
        # handle for the garbage collector to reclaim.
        with gzip.open(path, 'rb') as fp:
            return fp.read()

    def report(label, loads, path, number):
        # Decompression happens outside the timed statement; timeit returns
        # the total for `number` runs, so print the per-call mean.
        dump = read_gz(path)
        total = timeit.timeit(stmt=lambda: loads(dump), number=number)
        print('%s: %.5f sec' % (label, total / number))

    print('[load]')
    report('json.loads', json_loads, 'data.json.gz', 40)
    report('yaml.load', yaml_loads, 'data.yaml.gz', 1)
    if yaml.__with_libyaml__:
        report('yaml.load(libyaml)', cyaml_loads, 'data.yaml.gz', 1)
    else:
        print('yaml.load(libyaml): -')
    report('bson.BSON.decode', bson_loads, 'data.bson.gz', 110)
    if ujson:
        report('ujson.loads', ujson_loads, 'data.json.gz', 100)
    else:
        print('ujson.loads: -')
    report('msgpack.loads', msgpack_loads, 'data.mpac.gz', 250)
    print('')


#=======================================================================================================================
def json_dumps(data):
    """Encode ``data`` as a JSON string with the standard ``json`` module."""
    encoded = json.dumps(data)
    return encoded

#=======================================================================================================================
def ujson_dumps(data):
    """Encode ``data`` as JSON with ``ujson``.

    ``ujson`` is ``None`` when its import failed, so callers check it first.
    """
    encoded = ujson.dumps(data)
    return encoded

#=======================================================================================================================
def yaml_dumps(data):
    """Serialize ``data`` to YAML with ``yaml.dump`` and its default dumper."""
    document = yaml.dump(data)
    return document

#=======================================================================================================================
def cyaml_dumps(data):
    """Serialize ``data`` to YAML using PyYAML's ``CDumper``."""
    dumper_cls = yaml.CDumper
    return yaml.dump(data, Dumper=dumper_cls)

#=======================================================================================================================
def bson_dumps(data):
    """Encode ``data`` through ``bson.BSON.encode`` and return the result."""
    encoded = bson.BSON.encode(data)
    return encoded

#=======================================================================================================================
def msgpack_dumps(data):
    """Pack ``data`` into a MessagePack payload with ``msgpack.dumps``."""
    packed = msgpack.dumps(data)
    return packed

#=======================================================================================================================
def test_dump():
    """Measure and print the average time of one dump call per format.

    The object being serialized is loaded once from ``data.mpac.gz`` in the
    current directory. Encoders that are not available (libyaml bindings,
    ujson) are reported as ``-``.
    """

    # Close the file as soon as it is read instead of leaving the open handle
    # for the garbage collector to reclaim.
    with gzip.open('data.mpac.gz', 'rb') as fp:
        data = msgpack.loads(fp.read())

    def report(label, dumps, number):
        # timeit returns the total for `number` runs; print the per-call mean.
        total = timeit.timeit(stmt=lambda: dumps(data), number=number)
        print('%s: %.5f sec' % (label, total / number))

    print('[dump]')
    report('json.dumps', json_dumps, 75)
    report('yaml.dump', yaml_dumps, 1)
    if yaml.__with_libyaml__:
        report('yaml.dump(libyaml)', cyaml_dumps, 1)
    else:
        print('yaml.dump(libyaml): -')
    report('bson.BSON.encode', bson_dumps, 18)
    if ujson:
        report('ujson.dumps', ujson_dumps, 116)
    else:
        print('ujson.dumps: -')
    # Timed through the wrapper like every other codec, so all measurements
    # carry the same extra function-call layer.
    report('msgpack.dumps', msgpack_dumps, 166)
    print('')


#=======================================================================================================================
def validationcheck():
    """Check that each dump/load pair round-trips the sample data.

    For every format the payload size is printed, then the payload is decoded
    and re-encoded once more and must keep the same length. Only lengths are
    compared, not contents. Formats whose library is unavailable (ujson,
    libyaml bindings) are reported as ``-``.

    Raises:
        AssertionError: if a re-encoded payload changes length.
    """

    def read_gz(path):
        # Close the file as soon as it is read instead of leaving the open
        # handle for the garbage collector to reclaim.
        with gzip.open(path, 'rb') as fp:
            return fp.read()

    def check(label, loads, dumps, dump, note=''):
        # Print the payload size, run one more load/dump cycle and require the
        # size to be stable. Raised explicitly so the check also runs under
        # ``python -O``. Returns the re-encoded payload.
        dump_len = len(dump)
        print('%s: %d bytes%s' % (label, dump_len, note))
        redumped = dumps(loads(dump))
        if len(redumped) != dump_len:
            raise AssertionError('%s%s: length changed from %d to %d'
                                 % (label, note, dump_len, len(redumped)))
        return redumped

    print('[data]')

    # The stored JSON was written with different options than json_dumps()
    # uses, so normalize it with one dump/load cycle before measuring.
    dump = json_dumps(json_loads(read_gz('data.json.gz')))
    dump = check('data.json', json_loads, json_dumps, dump)

    if ujson:
        # json and ujson format their output slightly differently, so re-dump
        # with ujson before checking.
        dump = ujson_dumps(ujson_loads(dump))
        check('data.json', ujson_loads, ujson_dumps, dump, ' (ujson)')
    else:
        print('data.json: - (ujson)')

    dump = yaml_dumps(yaml_loads(read_gz('data.yaml.gz')))
    dump = check('data.yaml', yaml_loads, yaml_dumps, dump)

    if yaml.__with_libyaml__:
        dump = cyaml_dumps(cyaml_loads(dump))
        check('data.yaml', cyaml_loads, cyaml_dumps, dump, ' (libyaml)')
    else:
        print('data.yaml: - (libyaml)')

    # BSON and msgpack are compared against the stored bytes directly.
    check('data.bson', bson_loads, bson_dumps, read_gz('data.bson.gz'))
    check('data.mpac', msgpack_loads, msgpack_dumps, read_gz('data.mpac.gz'))

    print('')

#=======================================================================================================================
def version():
    """Print the interpreter version and the version of each library used."""
    if ujson:
        ujson_version = ujson.__version__
    else:
        ujson_version = '-'

    print('[version]')
    print('sys.version: %s' % sys.version)
    print('json.__version__: %s' % json.__version__)
    print('yaml.__version__: %s libyaml=%s'
          % (yaml.__version__, yaml.__with_libyaml__))
    print('pymongo.version(bson): %s' % (pymongo.version,))
    print('ujson.__version__: %s' % ujson_version)
    # One-element tuples keep a tuple-valued version from being unpacked by
    # the % operator.
    print('msgpack.version: %s' % (msgpack.version,))
    print('')

#=======================================================================================================================
def main():
    """Run every stage of the benchmark in order and return exit status 0."""
    stages = (version, validationcheck, test_load, test_dump)
    for stage in stages:
        stage()
    return 0

#=======================================================================================================================
# Run the benchmark only when this file is executed as a script; a falsy
# result from main() is reported as exit status 0.
if __name__ == '__main__':
    exit_status = main() or 0
    sys.exit(exit_status)

CPythonで実行

[version]
sys.version: 2.7.3 (default, Aug  1 2012, 05:16:07) 
[GCC 4.6.3]
json.__version__: 2.0.9
yaml.__version__: 3.10 libyaml=False
pymongo.version(bson): 2.4.2
ujson.__version__: 1.30
msgpack.version: (0, 3, 0)

[data]
data.json: 2719381 bytes
data.json: 2563594 bytes (ujson)
data.yaml: 2594612 bytes
data.yaml: - (libyaml)
data.bson: 2610551 bytes
data.mpac: 2180995 bytes

[load]
json.loads: 0.27414 sec
yaml.load: 55.24576 sec
yaml.load(libyaml): -
bson.BSON.decode: 0.11302 sec
ujson.loads: 0.11755 sec
msgpack.loads: 0.06876 sec

[dump]
json.dumps: 0.15120 sec
yaml.dump: 36.16442 sec
yaml.dump(libyaml): -
bson.BSON.encode: 0.19599 sec
ujson.dumps: 0.09113 sec
msgpack.dumps: 0.07143 sec

PyPyで実行

[version]
sys.version: 2.7.2 (341e1e3821ff, Jun 07 2012, 15:40:31)
[PyPy 1.9.0 with GCC 4.4.3]
json.__version__: 2.0.9
yaml.__version__: 3.10 libyaml=False
pymongo.version(bson): 2.4.2
ujson.__version__: -
msgpack.version: (0, 3, 0)

[data]
data.json: 2719381 bytes
data.json: - (ujson)
data.yaml: 2594612 bytes
data.yaml: - (libyaml)
data.bson: 2610551 bytes
data.mpac: 2180995 bytes

[load]
json.loads: 0.60501 sec
yaml.load: 8.02591 sec
yaml.load(libyaml): -
bson.BSON.decode: 0.38399 sec
ujson.loads: -
msgpack.loads: 0.26650 sec

[dump]
json.dumps: 0.26044 sec
yaml.dump: 8.60209 sec
yaml.dump(libyaml): -
bson.BSON.encode: 0.25331 sec
ujson.dumps: -
msgpack.dumps: 0.13120 sec