Try   HackMD

profiling in python

tags: python

Hunting Performance in Python Code

from pstats import Stats
from random import randint
import cProfile
import profile
import tracemalloc  # python3 only


def insert_sort(data):
    result = []
    for value in data:
        insert_value(result, value)
    return result


def insert_value(array, value):
    for i, existing in enumerate(array):
        if existing > value:
            array.insert(i, value)
            return
    array.append(value)


maxsize = 10**4
data = [randint(0, maxsize) for _ in range(0, maxsize)]  # prepare data

# start profileing
prof = profile.Profile()
prof.runcall(insert_sort, data)

# profileing stats information
stat = Stats(prof)
stat.strip_dirs()
stat.sort_stats('ncalls')
stat.print_stats()

# print which function call other function times
stat.print_callers()

# start tracemalloc
tracemalloc.start()
insert_sort(data)
snapshot = tracemalloc.take_snapshot()
top_stats = snapshot.statistics('lineno')

print("[Top 10]")
for stat in top_stats[:10]:
    print(stat)
​​​​         20003 function calls in 1.461 seconds
​​​​
​​​​   Ordered by: call count
​​​​
​​​​   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
​​​​    10000    1.424    0.000    1.448    0.000 <ipython-input-1-ca319fadb962>:15(insert_value)
​​​​     9992    0.024    0.000    0.024    0.000 :0(insert)
​​​​        8    0.000    0.000    0.000    0.000 :0(append)
​​​​        1    0.013    0.013    1.461    1.461 <ipython-input-1-ca319fadb962>:8(insert_sort)
​​​​        1    0.000    0.000    0.000    0.000 :0(setprofile)
​​​​        1    0.000    0.000    1.461    1.461 profile:0(<function insert_sort at 0x7f9490665510>)
​​​​        0    0.000             0.000          profile:0(profiler)
​​​​
​​​​
​​​​   Ordered by: call count
​​​​
​​​​Function                                             was called by...
​​​​<ipython-input-1-ca319fadb962>:15(insert_value)      <- <ipython-input-1-ca319fadb962>:8(insert_sort)(10000)    1.461
​​​​:0(insert)                                           <- <ipython-input-1-ca319fadb962>:15(insert_value)(9992)    1.448
​​​​:0(append)                                           <- <ipython-input-1-ca319fadb962>:15(insert_value)(8)    1.448
​​​​<ipython-input-1-ca319fadb962>:8(insert_sort)        <- profile:0(<function insert_sort at 0x7f9490665510>)(1)    1.461
​​​​:0(setprofile)                                       <- profile:0(<function insert_sort at 0x7f9490665510>)(1)    1.461
​​​​profile:0(<function insert_sort at 0x7f9490665510>)  <- profile:0(profiler)(1)    0.000
​​​​profile:0(profiler)                                  <- 
​​​​
​​​​
​​​​[Top 10]
​​​​/usr/lib/python3.5/codeop.py:133: size=226 B, count=3, average=75 B
​​​​/usr/local/lib/python3.5/dist-packages/zmq/sugar/poll.py:99: size=224 B, count=1, average=224 B
​​​​/usr/local/lib/python3.5/dist-packages/ipykernel/iostream.py:383: size=152 B, count=2, average=76 B
​​​​/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py:2800: size=96 B, count=3, average=32 B
​​​​/usr/local/lib/python3.5/dist-packages/simplejson/encoder.py:350: size=64 B, count=1, average=64 B
​​​​/usr/local/lib/python3.5/dist-packages/simplejson/encoder.py:225: size=64 B, count=1, average=64 B
​​​​/usr/local/lib/python3.5/dist-packages/ipykernel/iostream.py:352: size=64 B, count=1, average=64 B
​​​​/usr/local/lib/python3.5/dist-packages/traitlets/traitlets.py:600: size=62 B, count=1, average=62 B
​​​​/usr/local/lib/python3.5/dist-packages/tornado/ioloop.py:863: size=48 B, count=1, average=48 B
​​​​/usr/local/lib/python3.5/dist-packages/zmq/eventloop/ioloop.py:123: size=32 B, count=1, average=32 B
import time
import statistics
from functools import wraps

def benchmark(fun,times=10,result_enable=False):
    @wraps(fun)
    def benchmark_function(*args,**kwargs):
        print('current fun:',fun.__name__)
        print('arguments"',args)
        print('key arguments:',kwargs)
        results = []
        gstart = time.time()
        for _ in range(times):
            start = time.time()
            count = len(fun(*args,**kwargs))
            end = time.time()
            results.append(end-start)
        gend = time.time()
        mean = statistics.mean(results)
        stdev = statistics.stdev(results)
        perc = (stdev * 100)/ mean
        print ("Benchmark duration: {} seconds".format(gend-gstart))
        print ("Mean duration: {} seconds".format(mean))
        print ("Standard deviation: {} ({} %)".format(stdev, perc))
        return (gend-gstart,mean,stdev,perc)
    return benchmark_function
@benchmark
def primes(n): 
    if n==2:
        return [2]
    elif n<2:
        return []
    s=list(range(3,n+1,2))
    mroot = n ** 0.5
    half=(n+1)//2-1
    i=0
    m=3
    while m <= mroot:
        if s[i]:
            j=(m*m-3)//2
            s[j]=0
            while j<half:
                s[j]=0
                j+=m
        i=i+1
        m=2*i+3
    return [2]+[x for x in s if x]
primes(1000000)
​​​​current fun: primes
​​​​arguments" (1000000,)
​​​​key arguments: {}
​​​​Benchmark duration: 1.3134047985076904 seconds
​​​​Mean duration: 0.1313349962234497 seconds
​​​​Standard deviation: 0.026443051577771697 (20.13404830254247 %)





​​​​(1.3134047985076904,
​​​​ 0.1313349962234497,
​​​​ 0.026443051577771697,
​​​​ 20.13404830254247)
!cpupower frequency-info
​​​​analyzing CPU 0:
​​​​  driver: intel_pstate
​​​​  CPUs which run at the same hardware frequency: 0
​​​​  CPUs which need to have their frequency coordinated by software: 0
​​​​  maximum transition latency:  Cannot determine or is not supported.
​​​​  hardware limits: 400 MHz - 3.10 GHz
​​​​  available cpufreq governors: performance powersave
​​​​  current policy: frequency should be within 3.10 GHz and 3.10 GHz.
​​​​                  The governor "performance" may decide which speed to use
​​​​                  within this range.
​​​​  current CPU frequency: Unable to call hardware
​​​​  current CPU frequency: 3.10 GHz (asserted by call to kernel)
​​​​  boost state support:
​​​​    Supported: yes
​​​​    Active: yes
  • performance – run the CPU at the maximum frequency.
  • powersave – run the CPU at the minimum frequency.
  • userspace – run the CPU at user specified frequencies.
  • ondemand – scales the frequency dynamically according to current load. Jumps to the highest frequency and then possibly back off as the idle time increases.
  • conservative – scales the frequency dynamically according to current load. Scales the frequency more gradually than ondemand.

change the cpu performance

cpupower frequency-set -g performance
cpupower frequency-set --min 3100000 --max 3100000
!cpupower frequency-info
​​​​analyzing CPU 0:
​​​​  driver: intel_pstate
​​​​  CPUs which run at the same hardware frequency: 0
​​​​  CPUs which need to have their frequency coordinated by software: 0
​​​​  maximum transition latency:  Cannot determine or is not supported.
​​​​  hardware limits: 400 MHz - 3.10 GHz
​​​​  available cpufreq governors: performance powersave
​​​​  current policy: frequency should be within 3.10 GHz and 3.10 GHz.
​​​​                  The governor "performance" may decide which speed to use
​​​​                  within this range.
​​​​  current CPU frequency: Unable to call hardware
​​​​  current CPU frequency: 3.10 GHz (asserted by call to kernel)
​​​​  boost state support:
​​​​    Supported: yes
​​​​    Active: yes

The power governor has a great impact on how a CPU is used. By default, the governor is set to automatically scale the frequency to reduce the power consumption. We do not want this on our system and we proceed by disabling it from GRUB. Just edit /boot/grub/grub.cfg (but if you do be careful that on a kernel upgrade, this will be gone) or create a new kernel entry in /etc/grub.d/40_custom. Our boot line must contain the following flag: intel_pstate=disable, like this:

linux   /boot/vmlinuz-4.4.0-78-generic.efi.signed root=UUID=86097ec1-3fa4-4d00-97c7-3bf91787be83 ro intel_pstate=disable quiet splash $vt_handoff

ASLR (Address Space Layout Randomizer)

close aslr

# disable
sudo bash -c 'echo 0 >| /proc/sys/kernel/randomize_va_space'
# enable
sudo bash -c 'echo 2 >| /proc/sys/kernel/randomize_va_space'
%%prun 
primes(1000000)
​​​​current fun: primes
​​​​arguments" (1000000,)
​​​​key arguments: {}
​​​​Benchmark duration: 1.2920074462890625 seconds
​​​​Mean duration: 0.12919511795043945 seconds
​​​​Standard deviation: 0.006949116079384523 (5.378776063388304 %)
%%time 
primes(10000000)
​​​​current fun: primes
​​​​arguments" (10000000,)
​​​​key arguments: {}
​​​​Benchmark duration: 15.899296283721924 seconds
​​​​Mean duration: 1.5899237871170044 seconds
​​​​Standard deviation: 0.08453717221379202 (5.317058144471351 %)
​​​​CPU times: user 15.7 s, sys: 156 ms, total: 15.9 s
​​​​Wall time: 15.9 s





​​​​(15.899296283721924,
​​​​ 1.5899237871170044,
​​​​ 0.08453717221379202,
​​​​ 5.317058144471351)
%%timeit
primes(1000000)
​​​​current fun: primes
​​​​arguments" (1000000,)
​​​​key arguments: {}
​​​​Benchmark duration: 1.3215422630310059 seconds
​​​​Mean duration: 0.1321493625640869 seconds
​​​​Standard deviation: 0.0034519213628097308 (2.6121362190723345 %)
​​​​current fun: primes
​​​​arguments" (1000000,)
​​​​key arguments: {}
​​​​Benchmark duration: 1.3617463111877441 seconds
​​​​Mean duration: 0.13617000579833985 seconds
​​​​Standard deviation: 0.008070226512751822 (5.926581603222794 %)
​​​​current fun: primes
​​​​arguments" (1000000,)
​​​​key arguments: {}
​​​​Benchmark duration: 1.4243576526641846 seconds
​​​​Mean duration: 0.14243073463439943 seconds
​​​​Standard deviation: 0.010772609218486329 (7.563402130964339 %)
​​​​current fun: primes
​​​​arguments" (1000000,)
​​​​key arguments: {}
​​​​Benchmark duration: 1.5485849380493164 seconds
​​​​Mean duration: 0.1548527717590332 seconds
​​​​Standard deviation: 0.00691087803664961 (4.4628700914722055 %)
​​​​current fun: primes
​​​​arguments" (1000000,)
​​​​key arguments: {}
​​​​Benchmark duration: 1.351806402206421 seconds
​​​​Mean duration: 0.13517611026763915 seconds
​​​​Standard deviation: 0.009168839147290404 (6.782884290084062 %)
​​​​current fun: primes
​​​​arguments" (1000000,)
​​​​key arguments: {}
​​​​Benchmark duration: 1.2886924743652344 seconds
​​​​Mean duration: 0.12886531352996827 seconds
​​​​Standard deviation: 0.0032479093750779576 (2.5203906979379984 %)
​​​​current fun: primes
​​​​arguments" (1000000,)
​​​​key arguments: {}
​​​​Benchmark duration: 1.3597874641418457 seconds
​​​​Mean duration: 0.13597421646118163 seconds
​​​​Standard deviation: 0.01708626966270827 (12.565815863764227 %)
​​​​current fun: primes
​​​​arguments" (1000000,)
​​​​key arguments: {}
​​​​Benchmark duration: 1.395094871520996 seconds
​​​​Mean duration: 0.13950414657592775 seconds
​​​​Standard deviation: 0.01917858677169858 (13.747682231983173 %)
​​​​1.39 s ± 75.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%%writefile test_primes.py
import time
import statistics
from functools import wraps

def benchmark(fun,times=10,result_enable=False):
    @wraps(fun)
    def benchmark_function(*args,**kwargs):
        print('current fun:',fun.__name__)
        print('arguments"',args)
        print('key arguments:',kwargs)
        results = []
        gstart = time.time()
        for _ in range(times):
            start = time.time()
            count = len(fun(*args,**kwargs))
            end = time.time()
            results.append(end-start)
        gend = time.time()
        mean = statistics.mean(results)
        stdev = statistics.stdev(results)
        perc = (stdev * 100)/ mean
        print ("Benchmark duration: {} seconds".format(gend-gstart))
        print ("Mean duration: {} seconds".format(mean))
        print ("Standard deviation: {} ({} %)".format(stdev, perc))
        return (gend-gstart,mean,stdev,perc)
    return benchmark_function

@benchmark
def primes(n): 
    if n==2:
        return [2]
    elif n<2:
        return []
    s=list(range(3,n+1,2))
    mroot = n ** 0.5
    half=(n+1)//2-1
    i=0
    m=3
    while m <= mroot:
        if s[i]:
            j=(m*m-3)//2
            s[j]=0
            while j<half:
                s[j]=0
                j+=m
        i=i+1
        m=2*i+3
    return [2]+[x for x in s if x]

primes(1000000)
​​​​Writing test_primes.py
!pypy -V
​​​​Python 3.5.3 (2875f328eae2, Apr 02 2017, 17:58:49)
​​​​[PyPy 5.7.1-beta0 with GCC 6.2.0 20160901]
!pypy test_primes.py
​​​​current fun: primes
​​​​arguments" (1000000,)
​​​​key arguments: {}
​​​​Benchmark duration: 0.13382816314697266 seconds
​​​​Mean duration: 0.013373184204101562 seconds
​​​​Standard deviation: 0.0024842894560805 (18.576648748460123 %)
%%timeit
!pypy test_primes.py
​​​​current fun: primes
​​​​arguments" (1000000,)
​​​​key arguments: {}
​​​​Benchmark duration: 0.1339244842529297 seconds
​​​​Mean duration: 0.013384270668029784 seconds
​​​​Standard deviation: 0.003059768676200482 (22.86092946034908 %)
​​​​current fun: primes
​​​​arguments" (1000000,)
​​​​key arguments: {}
​​​​Benchmark duration: 0.12753796577453613 seconds
​​​​Mean duration: 0.01274564266204834 seconds
​​​​Standard deviation: 0.0021554952747125347 (16.911624873422642 %)
​​​​current fun: primes
​​​​arguments" (1000000,)
​​​​key arguments: {}
​​​​Benchmark duration: 0.12777066230773926 seconds
​​​​Mean duration: 0.012769460678100586 seconds
​​​​Standard deviation: 0.002132414121722897 (16.69932799417247 %)
​​​​current fun: primes
​​​​arguments" (1000000,)
​​​​key arguments: {}
​​​​Benchmark duration: 0.1306934356689453 seconds
​​​​Mean duration: 0.013060450553894043 seconds
​​​​Standard deviation: 0.002033298816640572 (15.56836656017455 %)
​​​​current fun: primes
​​​​arguments" (1000000,)
​​​​key arguments: {}
​​​​Benchmark duration: 0.13000965118408203 seconds
​​​​Mean duration: 0.012992572784423829 seconds
​​​​Standard deviation: 0.002555783370927481 (19.671110667099644 %)
​​​​current fun: primes
​​​​arguments" (1000000,)
​​​​key arguments: {}
​​​​Benchmark duration: 0.12881755828857422 seconds
​​​​Mean duration: 0.012873315811157226 seconds
​​​​Standard deviation: 0.002499031533282555 (19.412493019993025 %)
​​​​current fun: primes
​​​​arguments" (1000000,)
​​​​key arguments: {}
​​​​Benchmark duration: 0.13097691535949707 seconds
​​​​Mean duration: 0.0130889892578125 seconds
​​​​Standard deviation: 0.002260982415565542 (17.273926741257096 %)
​​​​current fun: primes
​​​​arguments" (1000000,)
​​​​key arguments: {}
​​​​Benchmark duration: 0.1263427734375 seconds
​​​​Mean duration: 0.01262507438659668 seconds
​​​​Standard deviation: 0.0020692545801631254 (16.390038718187157 %)
​​​​333 ms ± 9.46 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

Measuring Memory Consumption

pip install memory_profiler
pip install psutil
from memory_profiler import profile
%%writefile primes1.py
from memory_profiler import profile


@profile(precision=6)
def primes1(n): 
    if n==2:
        return [2]
    elif n<2:
        return []
    s=list(range(3,n+1,2))
    mroot = n ** 0.5
    half=(n+1)//2-1
    i=0
    m=3
    while m <= mroot:
        if s[i]:
            j=(m*m-3)//2
            s[j]=0
            while j<half:
                s[j]=0
                j+=m
        i=i+1
        m=2*i+3
    return [2]+[x for x in s if x]

len(primes1(100000))
​​​​Overwriting primes1.py
%run primes1.py
​​​​Filename: /home/splasky/juypter-notebook/primes1.py
​​​​
​​​​Line #    Mem usage    Increment   Line Contents
​​​​================================================
​​​​     4  49.328125 MiB   0.000000 MiB   @profile(precision=6)
​​​​     5                             def primes1(n): 
​​​​     6  49.328125 MiB   0.000000 MiB       if n==2:
​​​​     7                                     return [2]
​​​​     8  49.328125 MiB   0.000000 MiB       elif n<2:
​​​​     9                                     return []
​​​​    10  51.039062 MiB   1.710938 MiB       s=list(range(3,n+1,2))
​​​​    11  51.039062 MiB   0.000000 MiB       mroot = n ** 0.5
​​​​    12  51.039062 MiB   0.000000 MiB       half=(n+1)//2-1
​​​​    13  51.039062 MiB   0.000000 MiB       i=0
​​​​    14  51.039062 MiB   0.000000 MiB       m=3
​​​​    15  51.039062 MiB   0.000000 MiB       while m <= mroot:
​​​​    16  51.039062 MiB   0.000000 MiB           if s[i]:
​​​​    17  51.039062 MiB   0.000000 MiB               j=(m*m-3)//2
​​​​    18  51.039062 MiB   0.000000 MiB               s[j]=0
​​​​    19  51.039062 MiB   0.000000 MiB               while j<half:
​​​​    20  51.039062 MiB   0.000000 MiB                   s[j]=0
​​​​    21  51.039062 MiB   0.000000 MiB                   j+=m
​​​​    22  51.039062 MiB   0.000000 MiB           i=i+1
​​​​    23  51.039062 MiB   0.000000 MiB           m=2*i+3
​​​​    24  51.296875 MiB   0.257812 MiB       return [2]+[x for x in s if x]

using pypy

pypy -m memory_profiler primes1.py
%%writefile primes2.py
# improve by range call
from memory_profiler import profile


@profile(precision=6)
def primes2(n): 
    if n==2:
        return [2]
    elif n<2:
        return []
    s=list(range(3,n+1,2))
    mroot = n ** 0.5
    half=(n+1)//2-1
    i=0
    m=3
    while m <= mroot:
        if s[i]:
            j=(m*m-3)//2
            s[j]=0
            while j<half:
                s[j]=0
                j+=m
        i=i+1
        m=2*i+3
    l = [2]
    for x in s:
        if x:
            l.append(x)
    return l


len(primes2(100000))
​​​​Overwriting primes2.py

Visual Studio Code - Python debugging - How to step into external functions/packages?

"justMyCode": false