mirror of
https://github.com/redoules/redoules.github.io.git
synced 2025-12-12 15:59:34 +00:00
419 lines
19 KiB
HTML
419 lines
19 KiB
HTML
<!DOCTYPE html>
|
|
<html lang="fr">
|
|
|
|
<head>
|
|
<meta charset="utf-8">
|
|
<meta http-equiv="X-UA-Compatible" content="IE=edge">
|
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
|
<!-- The above 3 meta tags *must* come first in the head; any other head content must come *after* these tags -->
|
|
<meta name="description" content="Data Science for Political and Social Phenomena">
|
|
<meta name="author" content="Guillaume Redoulès">
|
|
<link rel="icon" href="../favicon.ico">
|
|
|
|
<title>Introduction to dask arrays - Python</title>
|
|
|
|
<!-- JQuery -->
|
|
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.3/jquery.min.js"></script>
|
|
<script>
|
|
// If the CDN copy of jQuery did not define window.jQuery, write a script tag
// that loads the copy shipped with the theme instead.
if (!window.jQuery) {
  document.write('<script src="../theme/js/jquery.min.js"><\/script>');
}
|
|
</script>
|
|
|
|
<!-- Bootstrap core CSS -->
|
|
<link rel="stylesheet" href="../theme/css/bootstrap.css" />
|
|
<!-- IE10 viewport hack for Surface/desktop Windows 8 bug -->
|
|
<link rel="stylesheet" type="text/css" href="../theme/css/ie10-viewport-bug-workaround.css" />
|
|
<!-- Custom styles for this template -->
|
|
<link rel="stylesheet" type="text/css" href="../theme/css/style.css" />
|
|
<link rel="stylesheet" type="text/css" href="../theme/css/notebooks.css" />
|
|
<link href='https://fonts.googleapis.com/css?family=PT+Serif:400,700|Roboto:400,500,700' rel='stylesheet' type='text/css'>
|
|
|
|
<!-- HTML5 shim and Respond.js for IE8 support of HTML5 elements and media queries -->
|
|
<!--[if lt IE 9]>
|
|
<script src="https://oss.maxcdn.com/html5shiv/3.7.2/html5shiv.min.js"></script>
|
|
<script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
|
|
<![endif]-->
|
|
|
|
|
|
<meta name="tags" content="Parallel" />
|
|
|
|
|
|
</head>
|
|
|
|
<body>
|
|
|
|
<div class="navbar navbar-fixed-top">
|
|
<div class="container">
|
|
<div class="navbar-header">
|
|
<button type="button" class="navbar-toggle" data-toggle="collapse" data-target=".navbar-collapse">
|
|
<span class="icon-bar"></span>
|
|
<span class="icon-bar"></span>
|
|
<span class="icon-bar"></span>
|
|
</button>
|
|
<a class="navbar-brand" href="..">Guillaume Redoulès</a>
|
|
</div>
|
|
<div class="navbar-collapse collapse" id="searchbar">
|
|
|
|
<ul class="nav navbar-nav navbar-right">
|
|
<li class="dropdown">
|
|
<a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="false">About<span class="caret"></span></a>
|
|
<ul class="dropdown-menu">
|
|
<li><a href="../pages/about.html">About Guillaume</a></li>
|
|
<li><a href="https://github.com/redoules">GitHub</a></li>
|
|
<li><a href="https://www.linkedin.com/in/guillaume-redoul%C3%A8s-33923860/">LinkedIn</a></li>
|
|
</ul>
|
|
</li>
|
|
<li class="dropdown">
|
|
<a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="false">Data Science<span class="caret"></span></a>
|
|
<ul class="dropdown-menu">
|
|
<li><a href="..#Blog">Blog</a></li>
|
|
<li><a href="..#Python">Python</a></li>
|
|
<li><a href="..#Bash">Bash</a></li>
|
|
<li><a href="..#SQL">SQL</a></li>
|
|
<li><a href="..#Mathematics">Mathematics</a></li>
|
|
<li><a href="..#Machine_Learning">Machine Learning</a></li>
|
|
<li><a href="..#Projects">Projects</a></li>
|
|
</ul>
|
|
</li>
|
|
<li class="dropdown">
|
|
<a href="#" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-haspopup="true" aria-expanded="false">Projects<span class="caret"></span></a>
|
|
<ul class="dropdown-menu">
|
|
<li><a href="https://github.com/redoules/redoules.github.io">Notes (Github)</a></li>
|
|
</ul>
|
|
</li>
|
|
|
|
<!--<li class="dropdown">
|
|
<a href="../feeds/blog.rss.xml">Blog RSS</a>
|
|
</li>-->
|
|
|
|
|
|
</ul>
|
|
|
|
<form class="navbar-form" action="../search.html" onsubmit="return validateForm(this.elements['q'].value);">
|
|
<div class="form-group" style="display:inline;">
|
|
<div class="input-group" style="display:table;">
|
|
<span class="input-group-addon" style="width:1%;"><span class="glyphicon glyphicon-search"></span></span>
|
|
<input class="form-control search-query" name="q" id="tipue_search_input" placeholder="e.g. scikit KNN, pandas merge" required autocomplete="off" type="text">
|
|
</div>
|
|
</div>
|
|
</form>
|
|
|
|
</div>
|
|
<!--/.nav-collapse -->
|
|
</div>
|
|
</div>
|
|
|
|
|
|
|
|
<!-- end of header section -->
|
|
<div class="container">
|
|
<!-- <div class="alert alert-warning" role="alert">
|
|
Did you find this page useful? Please do me a quick favor and <a href="#" class="alert-link">endorse me for data science on LinkedIn</a>.
|
|
</div> -->
|
|
<section id="content" class="body">
|
|
<header>
|
|
<h1>
|
|
Introduction to dask arrays
|
|
</h1>
|
|
<ol class="breadcrumb">
|
|
<li>
|
|
<time class="published" datetime="2019-12-07T19:02:00+01:00">
|
|
07 décembre 2019
|
|
</time>
|
|
</li>
|
|
<li>Python</li>
|
|
<li>Parallel</li>
|
|
</ol>
|
|
</header>
|
|
<div class='article_content'>
|
|
<p>Dask arrays extend the NumPy interface to larger-than-memory and parallel workflows across a distributed cluster. They look and feel a lot like NumPy and use NumPy under the hood.
|
|
Indeed Dask arrays coordinate many NumPy arrays arranged into a grid. These NumPy arrays may live on disk or on other machines.</p>
|
|
<p><img alt="A Dask array drawn as a grid of NumPy array chunks" src="../images/daskarray/dask-array-black-text.svg"></p>
|
|
<p>You can create a dask array as follows. Here we create an array of length 15 filled with ones. We have to specify a chunk size.</p>
|
|
<div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">dask.array</span> <span class="k">as</span> <span class="nn">da</span>
|
|
<span class="n">x</span> <span class="o">=</span> <span class="n">da</span><span class="o">.</span><span class="n">ones</span><span class="p">(</span><span class="mi">15</span><span class="p">,</span> <span class="n">chunks</span><span class="o">=</span><span class="p">(</span><span class="mi">5</span><span class="p">,))</span>
|
|
<span class="n">x</span>
|
|
</pre></div>
|
|
|
|
|
|
<table>
|
|
<tr>
|
|
<td>
|
|
<table>
|
|
<thead>
|
|
<tr><td> </td><th> Array </th><th> Chunk </th></tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr><th> Bytes </th><td> 120 B </td> <td> 40 B </td></tr>
|
|
<tr><th> Shape </th><td> (15,) </td> <td> (5,) </td></tr>
|
|
<tr><th> Count </th><td> 3 Tasks </td><td> 3 Chunks </td></tr>
|
|
<tr><th> Type </th><td> float64 </td><td> numpy.ndarray </td></tr>
|
|
</tbody>
|
|
</table>
|
|
</td>
|
|
<td>
|
|
<svg width="170" height="86" style="stroke:rgb(0,0,0);stroke-width:1" >
|
|
|
|
<!-- Horizontal lines -->
|
|
<line x1="0" y1="0" x2="120" y2="0" style="stroke-width:2" />
|
|
<line x1="0" y1="36" x2="120" y2="36" style="stroke-width:2" />
|
|
|
|
<!-- Vertical lines -->
|
|
<line x1="0" y1="0" x2="0" y2="36" style="stroke-width:2" />
|
|
<line x1="40" y1="0" x2="40" y2="36" />
|
|
<line x1="80" y1="0" x2="80" y2="36" />
|
|
<line x1="120" y1="0" x2="120" y2="36" style="stroke-width:2" />
|
|
|
|
<!-- Colored Rectangle -->
|
|
<polygon points="0.000000,0.000000 120.000000,0.000000 120.000000,36.247830 0.000000,36.247830" style="fill:#ECB172A0;stroke-width:0"/>
|
|
|
|
<!-- Text -->
|
|
<text x="60.000000" y="56.247830" font-size="1.0rem" font-weight="100" text-anchor="middle" >15</text>
|
|
<text x="140.000000" y="18.123915" font-size="1.0rem" font-weight="100" text-anchor="middle" transform="rotate(0,140.000000,18.123915)">1</text>
|
|
</svg>
|
|
</td>
|
|
</tr>
|
|
</table>
|
|
|
|
<p>The output is a dask array composed of 3 numpy arrays of size 5 each.
|
|
If we try to compute the sum of all the elements of the array we won't get the result by using the <code>sum</code> method. Indeed, Dask objects are lazy by default and run the computations only when instructed.</p>
|
|
<div class="highlight"><pre><span></span><span class="n">x</span><span class="o">.</span><span class="n">sum</span><span class="p">()</span>
|
|
</pre></div>
|
|
|
|
|
|
<table>
|
|
<tr>
|
|
<td>
|
|
<table>
|
|
<thead>
|
|
<tr><td> </td><th> Array </th><th> Chunk </th></tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr><th> Bytes </th><td> 8 B </td> <td> 8 B </td></tr>
|
|
<tr><th> Shape </th><td> () </td> <td> () </td></tr>
|
|
<tr><th> Count </th><td> 7 Tasks </td><td> 1 Chunks </td></tr>
|
|
<tr><th> Type </th><td> float64 </td><td> numpy.ndarray </td></tr>
|
|
</tbody>
|
|
</table>
|
|
</td>
|
|
<td>
|
|
|
|
</td>
|
|
</tr>
|
|
</table>
|
|
|
|
<p>We have to call <code>compute</code> at the end of a chain of operations if we want the actual result.</p>
|
|
<div class="highlight"><pre><span></span><span class="n">x</span><span class="o">.</span><span class="n">sum</span><span class="p">()</span><span class="o">.</span><span class="n">compute</span><span class="p">()</span>
|
|
</pre></div>
|
|
|
|
|
|
<div class="highlight"><pre><span></span><span class="err">15.0</span>
|
|
</pre></div>
|
|
|
|
|
|
<p>The above example is pretty trivial. Let's say now that we have a 10000 by 10000 array. We choose to represent that array by chunks of 1000 by 1000.</p>
|
|
<div class="highlight"><pre><span></span><span class="n">x</span> <span class="o">=</span> <span class="n">da</span><span class="o">.</span><span class="n">random</span><span class="o">.</span><span class="n">random</span><span class="p">((</span><span class="mi">10000</span><span class="p">,</span><span class="mi">10000</span><span class="p">),</span> <span class="n">chunks</span><span class="o">=</span><span class="p">(</span><span class="mi">1000</span><span class="p">,</span><span class="mi">1000</span><span class="p">))</span>
|
|
<span class="n">x</span>
|
|
</pre></div>
|
|
|
|
|
|
<table>
|
|
<tr>
|
|
<td>
|
|
<table>
|
|
<thead>
|
|
<tr><td> </td><th> Array </th><th> Chunk </th></tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr><th> Bytes </th><td> 800.00 MB </td> <td> 8.00 MB </td></tr>
|
|
<tr><th> Shape </th><td> (10000, 10000) </td> <td> (1000, 1000) </td></tr>
|
|
<tr><th> Count </th><td> 100 Tasks </td><td> 100 Chunks </td></tr>
|
|
<tr><th> Type </th><td> float64 </td><td> numpy.ndarray </td></tr>
|
|
</tbody>
|
|
</table>
|
|
</td>
|
|
<td>
|
|
<svg width="170" height="170" style="stroke:rgb(0,0,0);stroke-width:1" >
|
|
|
|
<!-- Horizontal lines -->
|
|
<line x1="0" y1="0" x2="120" y2="0" style="stroke-width:2" />
|
|
<line x1="0" y1="12" x2="120" y2="12" />
|
|
<line x1="0" y1="24" x2="120" y2="24" />
|
|
<line x1="0" y1="36" x2="120" y2="36" />
|
|
<line x1="0" y1="48" x2="120" y2="48" />
|
|
<line x1="0" y1="60" x2="120" y2="60" />
|
|
<line x1="0" y1="72" x2="120" y2="72" />
|
|
<line x1="0" y1="84" x2="120" y2="84" />
|
|
<line x1="0" y1="96" x2="120" y2="96" />
|
|
<line x1="0" y1="108" x2="120" y2="108" />
|
|
<line x1="0" y1="120" x2="120" y2="120" style="stroke-width:2" />
|
|
|
|
<!-- Vertical lines -->
|
|
<line x1="0" y1="0" x2="0" y2="120" style="stroke-width:2" />
|
|
<line x1="12" y1="0" x2="12" y2="120" />
|
|
<line x1="24" y1="0" x2="24" y2="120" />
|
|
<line x1="36" y1="0" x2="36" y2="120" />
|
|
<line x1="48" y1="0" x2="48" y2="120" />
|
|
<line x1="60" y1="0" x2="60" y2="120" />
|
|
<line x1="72" y1="0" x2="72" y2="120" />
|
|
<line x1="84" y1="0" x2="84" y2="120" />
|
|
<line x1="96" y1="0" x2="96" y2="120" />
|
|
<line x1="108" y1="0" x2="108" y2="120" />
|
|
<line x1="120" y1="0" x2="120" y2="120" style="stroke-width:2" />
|
|
|
|
<!-- Colored Rectangle -->
|
|
<polygon points="0.000000,0.000000 120.000000,0.000000 120.000000,120.000000 0.000000,120.000000" style="fill:#ECB172A0;stroke-width:0"/>
|
|
|
|
<!-- Text -->
|
|
<text x="60.000000" y="140.000000" font-size="1.0rem" font-weight="100" text-anchor="middle" >10000</text>
|
|
<text x="140.000000" y="60.000000" font-size="1.0rem" font-weight="100" text-anchor="middle" transform="rotate(-90,140.000000,60.000000)">10000</text>
|
|
</svg>
|
|
</td>
|
|
</tr>
|
|
</table>
|
|
|
|
<p>Dask has created a 10 by 10 grid where each element of that grid is a 1000 by 1000 numpy array.</p>
|
|
<p>Again, we can do operations on that dataset that are very similar to the way we would do it with numpy.</p>
|
|
<p>First, let's add the array to its transpose.</p>
|
|
<div class="highlight"><pre><span></span><span class="n">y</span> <span class="o">=</span> <span class="n">x</span> <span class="o">+</span> <span class="n">x</span><span class="o">.</span><span class="n">T</span>
|
|
</pre></div>
|
|
|
|
|
|
<p>Then slice the array and take the mean.</p>
|
|
<div class="highlight"><pre><span></span><span class="n">z</span> <span class="o">=</span> <span class="n">y</span><span class="p">[::</span><span class="mi">2</span><span class="p">,</span> <span class="p">:]</span><span class="o">.</span><span class="n">mean</span><span class="p">(</span><span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
|
|
<span class="n">z</span>
|
|
</pre></div>
|
|
|
|
|
|
<table>
|
|
<tr>
|
|
<td>
|
|
<table>
|
|
<thead>
|
|
<tr><td> </td><th> Array </th><th> Chunk </th></tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr><th> Bytes </th><td> 40.00 kB </td> <td> 4.00 kB </td></tr>
|
|
<tr><th> Shape </th><td> (5000,) </td> <td> (500,) </td></tr>
|
|
<tr><th> Count </th><td> 540 Tasks </td><td> 10 Chunks </td></tr>
|
|
<tr><th> Type </th><td> float64 </td><td> numpy.ndarray </td></tr>
|
|
</tbody>
|
|
</table>
|
|
</td>
|
|
<td>
|
|
<svg width="170" height="75" style="stroke:rgb(0,0,0);stroke-width:1" >
|
|
|
|
<!-- Horizontal lines -->
|
|
<line x1="0" y1="0" x2="120" y2="0" style="stroke-width:2" />
|
|
<line x1="0" y1="25" x2="120" y2="25" style="stroke-width:2" />
|
|
|
|
<!-- Vertical lines -->
|
|
<line x1="0" y1="0" x2="0" y2="25" style="stroke-width:2" />
|
|
<line x1="12" y1="0" x2="12" y2="25" />
|
|
<line x1="24" y1="0" x2="24" y2="25" />
|
|
<line x1="36" y1="0" x2="36" y2="25" />
|
|
<line x1="48" y1="0" x2="48" y2="25" />
|
|
<line x1="60" y1="0" x2="60" y2="25" />
|
|
<line x1="72" y1="0" x2="72" y2="25" />
|
|
<line x1="84" y1="0" x2="84" y2="25" />
|
|
<line x1="96" y1="0" x2="96" y2="25" />
|
|
<line x1="108" y1="0" x2="108" y2="25" />
|
|
<line x1="120" y1="0" x2="120" y2="25" style="stroke-width:2" />
|
|
|
|
<!-- Colored Rectangle -->
|
|
<polygon points="0.000000,0.000000 120.000000,0.000000 120.000000,25.412617 0.000000,25.412617" style="fill:#ECB172A0;stroke-width:0"/>
|
|
|
|
<!-- Text -->
|
|
<text x="60.000000" y="45.412617" font-size="1.0rem" font-weight="100" text-anchor="middle" >5000</text>
|
|
<text x="140.000000" y="12.706308" font-size="1.0rem" font-weight="100" text-anchor="middle" transform="rotate(0,140.000000,12.706308)">1</text>
|
|
</svg>
|
|
</td>
|
|
</tr>
|
|
</table>
|
|
|
|
<p>When we want to get access to the result, we just need to use the <code>compute</code> method and dask will compute the result in parallel on the different cores of the machine.</p>
|
|
<div class="highlight"><pre><span></span><span class="n">z</span><span class="o">.</span><span class="n">compute</span><span class="p">()</span>
|
|
</pre></div>
|
|
|
|
|
|
<div class="highlight"><pre><span></span><span class="err">array([0.99874649, 0.99722168, 0.99725464, ..., 1.00849801, 1.00448204,</span>
|
|
<span class="err"> 0.99683664])</span>
|
|
</pre></div>
|
|
|
|
|
|
<p>In practice, dask is often used in tandem with data file formats like HDF5, Zarr or NetCDF.</p>
|
|
<p>In this situation you might load a file from disk and use the <code>from_array</code> function.</p>
|
|
<div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">h5py</span>
|
|
<span class="n">f</span> <span class="o">=</span> <span class="n">h5py</span><span class="o">.</span><span class="n">File</span><span class="p">(</span><span class="s2">"myfile.hdf5"</span><span class="p">)</span>
|
|
<span class="n">d</span> <span class="o">=</span> <span class="n">f</span><span class="p">[</span><span class="s2">"/data/path"</span><span class="p">]</span>
|
|
<span class="n">d</span><span class="o">.</span><span class="n">shape</span>
|
|
<span class="p">(</span><span class="mi">10000000</span><span class="p">,</span><span class="mi">1000000</span><span class="p">)</span>
|
|
|
|
<span class="kn">import</span> <span class="nn">dask.array</span> <span class="k">as</span> <span class="nn">da</span>
|
|
<span class="n">x</span> <span class="o">=</span> <span class="n">da</span><span class="o">.</span><span class="n">from_array</span><span class="p">(</span><span class="n">d</span><span class="p">,</span> <span class="n">chunks</span><span class="o">=</span><span class="p">(</span><span class="mi">10000</span><span class="p">,</span> <span class="mi">10000</span><span class="p">))</span>
|
|
<span class="n">x</span><span class="o">.</span><span class="n">mean</span><span class="p">(</span><span class="n">axis</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span><span class="o">.</span><span class="n">compute</span><span class="p">()</span>
|
|
</pre></div>
|
|
|
|
|
|
<div class="highlight"><pre><span></span>
|
|
</pre></div>
|
|
</div>
|
|
<aside>
|
|
<div class="bug-reporting__panel">
|
|
<h3>Find an error or bug? Have a suggestion?</h3>
|
|
<p>Everything on this site is available on GitHub. Head on over and <a href='https://github.com/redoules/redoules.github.io/issues/new'>submit an issue.</a> You can also message me directly by <a href='mailto:guillaume.redoules@gadz.org'>email</a>.</p>
|
|
</div>
|
|
</aside>
|
|
</section>
|
|
|
|
</div>
|
|
<!-- start of footer section -->
|
|
<footer class="footer">
|
|
<div class="container">
|
|
<p class="text-muted text-center">
|
|
This project contains 119 pages and is available on <a href="https://github.com/redoules/redoules.github.io">GitHub</a>.
|
|
<br/>
|
|
Copyright © Guillaume Redoulès,
|
|
<time datetime="2018">2018</time>.
|
|
</p>
|
|
</div>
|
|
</footer>
|
|
|
|
<!-- This jQuery line finds any span that contains code highlighting classes and then selects the parent <pre> tag and adds a border. This is done as a workaround to visually distinguish the code inputs and outputs -->
|
|
<script>
|
|
// Visually separate code inputs and outputs: find every span carrying a code
// highlighting class, keep those whose direct parent is a <pre>, and give that
// <pre> a thin grey border.
(function () {
  var tokenSelector = ".hll, .n, .c, .err, .k, .o, .cm, .cp, .c1, .cs, .gd, .ge, .gr, .gh, .gi, .go, .gp, .gs, .gu, .gt, .kc, .kd, .kn, .kp, .kr, .kt, .m, .s, .na, .nb, .nc, .no, .nd, .ni, .ne, .nf, .nl, .nn, .nt, .nv, .ow, .w, .mf, .mh, .mi, .mo, .sb, .sc, .sd, .s2, .se, .sh, .si, .sx, .sr, .s1, .ss, .bp, .vc, .vg, .vi, .il";
  var highlightedBlocks = $(tokenSelector).parent("pre");
  highlightedBlocks.css("border", "1px solid #DEDEDE");
})();
|
|
</script>
|
|
|
|
|
|
<!-- Load Google Analytics -->
|
|
<script>
|
|
/*
|
|
(function(i, s, o, g, r, a, m) {
|
|
i['GoogleAnalyticsObject'] = r;
|
|
i[r] = i[r] || function() {
|
|
(i[r].q = i[r].q || []).push(arguments)
|
|
}, i[r].l = 1 * new Date();
|
|
a = s.createElement(o),
|
|
m = s.getElementsByTagName(o)[0];
|
|
a.async = 1;
|
|
a.src = g;
|
|
m.parentNode.insertBefore(a, m)
|
|
})(window, document, 'script', '//www.google-analytics.com/analytics.js', 'ga');
|
|
|
|
ga('create', 'UA-66582-32', 'auto');
|
|
ga('send', 'pageview');
|
|
*/
|
|
</script>
|
|
<!-- End of Google Analytics -->
|
|
|
|
<!-- Bootstrap core JavaScript
|
|
================================================== -->
|
|
<!-- Placed at the end of the document so the pages load faster -->
|
|
<script src="../theme/js/bootstrap.min.js"></script>
|
|
<!-- IE10 viewport hack for Surface/desktop Windows 8 bug -->
|
|
<script src="../theme/js/ie10-viewport-bug-workaround.js"></script>
|
|
|
|
|
|
</body>
|
|
|
|
</html> |