profiling in python

tags: `python`

Hunting Performance in Python Code

from pstats import Stats
from random import randint
import cProfile
import profile
import tracemalloc  # python3 only


def insert_sort(data):
    result = []
    for value in data:
        insert_value(result, value)
    return result


def insert_value(array, value):
    for i, existing in enumerate(array):
        if existing > value:
            array.insert(i, value)
            return
    array.append(value)


maxsize = 10**4
data = [randint(0, maxsize) for _ in range(0, maxsize)]  # prepare data

# start profileing
prof = profile.Profile()
prof.runcall(insert_sort, data)

# profileing stats information
stat = Stats(prof)
stat.strip_dirs()
stat.sort_stats('ncalls')
stat.print_stats()

# print which function call other function times
stat.print_callers()

# start tracemalloc
tracemalloc.start()
insert_sort(data)
snapshot = tracemalloc.take_snapshot()
top_stats = snapshot.statistics('lineno')

print("[Top 10]")
for stat in top_stats[:10]:
    print(stat)

         20003 function calls in 1.461 seconds

   Ordered by: call count

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
    10000    1.424    0.000    1.448    0.000 <ipython-input-1-ca319fadb962>:15(insert_value)
     9992    0.024    0.000    0.024    0.000 :0(insert)
        8    0.000    0.000    0.000    0.000 :0(append)
        1    0.013    0.013    1.461    1.461 <ipython-input-1-ca319fadb962>:8(insert_sort)
        1    0.000    0.000    0.000    0.000 :0(setprofile)
        1    0.000    0.000    1.461    1.461 profile:0(<function insert_sort at 0x7f9490665510>)
        0    0.000             0.000          profile:0(profiler)


   Ordered by: call count

Function                                             was called by...
<ipython-input-1-ca319fadb962>:15(insert_value)      <- <ipython-input-1-ca319fadb962>:8(insert_sort)(10000)    1.461
:0(insert)                                           <- <ipython-input-1-ca319fadb962>:15(insert_value)(9992)    1.448
:0(append)                                           <- <ipython-input-1-ca319fadb962>:15(insert_value)(8)    1.448
<ipython-input-1-ca319fadb962>:8(insert_sort)        <- profile:0(<function insert_sort at 0x7f9490665510>)(1)    1.461
:0(setprofile)                                       <- profile:0(<function insert_sort at 0x7f9490665510>)(1)    1.461
profile:0(<function insert_sort at 0x7f9490665510>)  <- profile:0(profiler)(1)    0.000
profile:0(profiler)                                  <- 


[Top 10]
/usr/lib/python3.5/codeop.py:133: size=226 B, count=3, average=75 B
/usr/local/lib/python3.5/dist-packages/zmq/sugar/poll.py:99: size=224 B, count=1, average=224 B
/usr/local/lib/python3.5/dist-packages/ipykernel/iostream.py:383: size=152 B, count=2, average=76 B
/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py:2800: size=96 B, count=3, average=32 B
/usr/local/lib/python3.5/dist-packages/simplejson/encoder.py:350: size=64 B, count=1, average=64 B
/usr/local/lib/python3.5/dist-packages/simplejson/encoder.py:225: size=64 B, count=1, average=64 B
/usr/local/lib/python3.5/dist-packages/ipykernel/iostream.py:352: size=64 B, count=1, average=64 B
/usr/local/lib/python3.5/dist-packages/traitlets/traitlets.py:600: size=62 B, count=1, average=62 B
/usr/local/lib/python3.5/dist-packages/tornado/ioloop.py:863: size=48 B, count=1, average=48 B
/usr/local/lib/python3.5/dist-packages/zmq/eventloop/ioloop.py:123: size=32 B, count=1, average=32 B

import time
import statistics
from functools import wraps

def benchmark(fun,times=10,result_enable=False):
    @wraps(fun)
    def benchmark_function(*args,**kwargs):
        print('current fun:',fun.__name__)
        print('arguments"',args)
        print('key arguments:',kwargs)
        results = []
        gstart = time.time()
        for _ in range(times):
            start = time.time()
            count = len(fun(*args,**kwargs))
            end = time.time()
            results.append(end-start)
        gend = time.time()
        mean = statistics.mean(results)
        stdev = statistics.stdev(results)
        perc = (stdev * 100)/ mean
        print ("Benchmark duration: {} seconds".format(gend-gstart))
        print ("Mean duration: {} seconds".format(mean))
        print ("Standard deviation: {} ({} %)".format(stdev, perc))
        return (gend-gstart,mean,stdev,perc)
    return benchmark_function

@benchmark
def primes(n): 
    if n==2:
        return [2]
    elif n<2:
        return []
    s=list(range(3,n+1,2))
    mroot = n ** 0.5
    half=(n+1)//2-1
    i=0
    m=3
    while m <= mroot:
        if s[i]:
            j=(m*m-3)//2
            s[j]=0
            while j<half:
                s[j]=0
                j+=m
        i=i+1
        m=2*i+3
    return [2]+[x for x in s if x]

primes(1000000)

current fun: primes
arguments" (1000000,)
key arguments: {}
Benchmark duration: 1.3134047985076904 seconds
Mean duration: 0.1313349962234497 seconds
Standard deviation: 0.026443051577771697 (20.13404830254247 %)





(1.3134047985076904,
 0.1313349962234497,
 0.026443051577771697,
 20.13404830254247)

!cpupower frequency-info

analyzing CPU 0:
  driver: intel_pstate
  CPUs which run at the same hardware frequency: 0
  CPUs which need to have their frequency coordinated by software: 0
  maximum transition latency:  Cannot determine or is not supported.
  hardware limits: 400 MHz - 3.10 GHz
  available cpufreq governors: performance powersave
  current policy: frequency should be within 3.10 GHz and 3.10 GHz.
                  The governor "performance" may decide which speed to use
                  within this range.
  current CPU frequency: Unable to call hardware
  current CPU frequency: 3.10 GHz (asserted by call to kernel)
  boost state support:
    Supported: yes
    Active: yes

performance – run the CPU at the maximum frequency.
powersave – run the CPU at the minimum frequency.
userspace – run the CPU at user specified frequencies.
ondemand – scales the frequency dynamically according to current load. Jumps to the highest frequency and then possibly back off as the idle time increases.
conservative – scales the frequency dynamically according to current load. Scales the frequency more gradually than ondemand.

change the cpu performance

cpupower frequency-set -g performance
cpupower frequency-set --min 3100000 --max 3100000

!cpupower frequency-info

analyzing CPU 0:
  driver: intel_pstate
  CPUs which run at the same hardware frequency: 0
  CPUs which need to have their frequency coordinated by software: 0
  maximum transition latency:  Cannot determine or is not supported.
  hardware limits: 400 MHz - 3.10 GHz
  available cpufreq governors: performance powersave
  current policy: frequency should be within 3.10 GHz and 3.10 GHz.
                  The governor "performance" may decide which speed to use
                  within this range.
  current CPU frequency: Unable to call hardware
  current CPU frequency: 3.10 GHz (asserted by call to kernel)
  boost state support:
    Supported: yes
    Active: yes

The power governor has a great impact on how a CPU is used. By default, the governor is set to automatically scale the frequency to reduce the power consumption. We do not want this on our system and we proceed by disabling it from GRUB. Just edit /boot/grub/grub.cfg (but if you do be careful that on a kernel upgrade, this will be gone) or create a new kernel entry in /etc/grub.d/40_custom. Our boot line must contain the following flag: intel_pstate=disable, like this:

linux   /boot/vmlinuz-4.4.0-78-generic.efi.signed root=UUID=86097ec1-3fa4-4d00-97c7-3bf91787be83 ro intel_pstate=disable quiet splash $vt_handoff

ASLR (Address Space Layout Randomizer)

close aslr

# disable
sudo bash -c 'echo 0 >| /proc/sys/kernel/randomize_va_space'
# enable
sudo bash -c 'echo 2 >| /proc/sys/kernel/randomize_va_space'

%%prun 
primes(1000000)

current fun: primes
arguments" (1000000,)
key arguments: {}
Benchmark duration: 1.2920074462890625 seconds
Mean duration: 0.12919511795043945 seconds
Standard deviation: 0.006949116079384523 (5.378776063388304 %)

%%time 
primes(10000000)

current fun: primes
arguments" (10000000,)
key arguments: {}
Benchmark duration: 15.899296283721924 seconds
Mean duration: 1.5899237871170044 seconds
Standard deviation: 0.08453717221379202 (5.317058144471351 %)
CPU times: user 15.7 s, sys: 156 ms, total: 15.9 s
Wall time: 15.9 s





(15.899296283721924,
 1.5899237871170044,
 0.08453717221379202,
 5.317058144471351)

%%timeit
primes(1000000)

current fun: primes
arguments" (1000000,)
key arguments: {}
Benchmark duration: 1.3215422630310059 seconds
Mean duration: 0.1321493625640869 seconds
Standard deviation: 0.0034519213628097308 (2.6121362190723345 %)
current fun: primes
arguments" (1000000,)
key arguments: {}
Benchmark duration: 1.3617463111877441 seconds
Mean duration: 0.13617000579833985 seconds
Standard deviation: 0.008070226512751822 (5.926581603222794 %)
current fun: primes
arguments" (1000000,)
key arguments: {}
Benchmark duration: 1.4243576526641846 seconds
Mean duration: 0.14243073463439943 seconds
Standard deviation: 0.010772609218486329 (7.563402130964339 %)
current fun: primes
arguments" (1000000,)
key arguments: {}
Benchmark duration: 1.5485849380493164 seconds
Mean duration: 0.1548527717590332 seconds
Standard deviation: 0.00691087803664961 (4.4628700914722055 %)
current fun: primes
arguments" (1000000,)
key arguments: {}
Benchmark duration: 1.351806402206421 seconds
Mean duration: 0.13517611026763915 seconds
Standard deviation: 0.009168839147290404 (6.782884290084062 %)
current fun: primes
arguments" (1000000,)
key arguments: {}
Benchmark duration: 1.2886924743652344 seconds
Mean duration: 0.12886531352996827 seconds
Standard deviation: 0.0032479093750779576 (2.5203906979379984 %)
current fun: primes
arguments" (1000000,)
key arguments: {}
Benchmark duration: 1.3597874641418457 seconds
Mean duration: 0.13597421646118163 seconds
Standard deviation: 0.01708626966270827 (12.565815863764227 %)
current fun: primes
arguments" (1000000,)
key arguments: {}
Benchmark duration: 1.395094871520996 seconds
Mean duration: 0.13950414657592775 seconds
Standard deviation: 0.01917858677169858 (13.747682231983173 %)
1.39 s ± 75.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

%%writefile test_primes.py
import time
import statistics
from functools import wraps

def benchmark(fun,times=10,result_enable=False):
    @wraps(fun)
    def benchmark_function(*args,**kwargs):
        print('current fun:',fun.__name__)
        print('arguments"',args)
        print('key arguments:',kwargs)
        results = []
        gstart = time.time()
        for _ in range(times):
            start = time.time()
            count = len(fun(*args,**kwargs))
            end = time.time()
            results.append(end-start)
        gend = time.time()
        mean = statistics.mean(results)
        stdev = statistics.stdev(results)
        perc = (stdev * 100)/ mean
        print ("Benchmark duration: {} seconds".format(gend-gstart))
        print ("Mean duration: {} seconds".format(mean))
        print ("Standard deviation: {} ({} %)".format(stdev, perc))
        return (gend-gstart,mean,stdev,perc)
    return benchmark_function

@benchmark
def primes(n): 
    if n==2:
        return [2]
    elif n<2:
        return []
    s=list(range(3,n+1,2))
    mroot = n ** 0.5
    half=(n+1)//2-1
    i=0
    m=3
    while m <= mroot:
        if s[i]:
            j=(m*m-3)//2
            s[j]=0
            while j<half:
                s[j]=0
                j+=m
        i=i+1
        m=2*i+3
    return [2]+[x for x in s if x]

primes(1000000)

Writing test_primes.py

!pypy -V

Python 3.5.3 (2875f328eae2, Apr 02 2017, 17:58:49)
[PyPy 5.7.1-beta0 with GCC 6.2.0 20160901]

!pypy test_primes.py

current fun: primes
arguments" (1000000,)
key arguments: {}
Benchmark duration: 0.13382816314697266 seconds
Mean duration: 0.013373184204101562 seconds
Standard deviation: 0.0024842894560805 (18.576648748460123 %)

%%timeit
!pypy test_primes.py

current fun: primes
arguments" (1000000,)
key arguments: {}
Benchmark duration: 0.1339244842529297 seconds
Mean duration: 0.013384270668029784 seconds
Standard deviation: 0.003059768676200482 (22.86092946034908 %)
current fun: primes
arguments" (1000000,)
key arguments: {}
Benchmark duration: 0.12753796577453613 seconds
Mean duration: 0.01274564266204834 seconds
Standard deviation: 0.0021554952747125347 (16.911624873422642 %)
current fun: primes
arguments" (1000000,)
key arguments: {}
Benchmark duration: 0.12777066230773926 seconds
Mean duration: 0.012769460678100586 seconds
Standard deviation: 0.002132414121722897 (16.69932799417247 %)
current fun: primes
arguments" (1000000,)
key arguments: {}
Benchmark duration: 0.1306934356689453 seconds
Mean duration: 0.013060450553894043 seconds
Standard deviation: 0.002033298816640572 (15.56836656017455 %)
current fun: primes
arguments" (1000000,)
key arguments: {}
Benchmark duration: 0.13000965118408203 seconds
Mean duration: 0.012992572784423829 seconds
Standard deviation: 0.002555783370927481 (19.671110667099644 %)
current fun: primes
arguments" (1000000,)
key arguments: {}
Benchmark duration: 0.12881755828857422 seconds
Mean duration: 0.012873315811157226 seconds
Standard deviation: 0.002499031533282555 (19.412493019993025 %)
current fun: primes
arguments" (1000000,)
key arguments: {}
Benchmark duration: 0.13097691535949707 seconds
Mean duration: 0.0130889892578125 seconds
Standard deviation: 0.002260982415565542 (17.273926741257096 %)
current fun: primes
arguments" (1000000,)
key arguments: {}
Benchmark duration: 0.1263427734375 seconds
Mean duration: 0.01262507438659668 seconds
Standard deviation: 0.0020692545801631254 (16.390038718187157 %)
333 ms ± 9.46 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

Measuring Memory Consumption

pip install memory_profiler
pip install psutil

from memory_profiler import profile

%%writefile primes1.py
from memory_profiler import profile


@profile(precision=6)
def primes1(n): 
    if n==2:
        return [2]
    elif n<2:
        return []
    s=list(range(3,n+1,2))
    mroot = n ** 0.5
    half=(n+1)//2-1
    i=0
    m=3
    while m <= mroot:
        if s[i]:
            j=(m*m-3)//2
            s[j]=0
            while j<half:
                s[j]=0
                j+=m
        i=i+1
        m=2*i+3
    return [2]+[x for x in s if x]

len(primes1(100000))

Overwriting primes1.py

%run primes1.py

Filename: /home/splasky/juypter-notebook/primes1.py

Line #    Mem usage    Increment   Line Contents
================================================
     4  49.328125 MiB   0.000000 MiB   @profile(precision=6)
     5                             def primes1(n): 
     6  49.328125 MiB   0.000000 MiB       if n==2:
     7                                     return [2]
     8  49.328125 MiB   0.000000 MiB       elif n<2:
     9                                     return []
    10  51.039062 MiB   1.710938 MiB       s=list(range(3,n+1,2))
    11  51.039062 MiB   0.000000 MiB       mroot = n ** 0.5
    12  51.039062 MiB   0.000000 MiB       half=(n+1)//2-1
    13  51.039062 MiB   0.000000 MiB       i=0
    14  51.039062 MiB   0.000000 MiB       m=3
    15  51.039062 MiB   0.000000 MiB       while m <= mroot:
    16  51.039062 MiB   0.000000 MiB           if s[i]:
    17  51.039062 MiB   0.000000 MiB               j=(m*m-3)//2
    18  51.039062 MiB   0.000000 MiB               s[j]=0
    19  51.039062 MiB   0.000000 MiB               while j<half:
    20  51.039062 MiB   0.000000 MiB                   s[j]=0
    21  51.039062 MiB   0.000000 MiB                   j+=m
    22  51.039062 MiB   0.000000 MiB           i=i+1
    23  51.039062 MiB   0.000000 MiB           m=2*i+3
    24  51.296875 MiB   0.257812 MiB       return [2]+[x for x in s if x]

using pypy

pypy -m memory_profiler primes1.py

%%writefile primes2.py
# improve by range call
from memory_profiler import profile


@profile(precision=6)
def primes2(n): 
    if n==2:
        return [2]
    elif n<2:
        return []
    s=list(range(3,n+1,2))
    mroot = n ** 0.5
    half=(n+1)//2-1
    i=0
    m=3
    while m <= mroot:
        if s[i]:
            j=(m*m-3)//2
            s[j]=0
            while j<half:
                s[j]=0
                j+=m
        i=i+1
        m=2*i+3
    l = [2]
    for x in s:
        if x:
            l.append(x)
    return l


len(primes2(100000))

Overwriting primes2.py

Visual Studio Code - Python debugging - How to step into external functions/packages?

"justMyCode": false

profiling in python

tags: python

Hunting Performance in Python Code

change the cpu performance

ASLR (Address Space Layout Randomizer)

Measuring Memory Consumption

using pypy

Visual Studio Code - Python debugging - How to step into external functions/packages?

Read more

如何利用pelican 建立自己的blog

How to c in 2016

有趣的memory alignment

ARM

tags: `python`