-- Introduction au Python scientifique --

Exploitation des données
de Top500.org

Utilisation de Pandas et etree pour tracer des données xml

simon.marie@lecnam.net

import xml.etree.ElementTree as ET

import matplotlib
import matplotlib.pyplot as plt
import pandas as pd

matplotlib.style.use('seaborn-poster')
matplotlib.style.use('fivethirtyeight')
%matplotlib inline

On s'intéresse d'abord à la dernière liste publiée sur top500.org

# Edition of the list to analyse. The month is zero-padded to two digits so
# that single-digit months produce the same 'TOP500_YYYYMM_all.xml' pattern
# as the '06' / '11' file names built elsewhere in this notebook.
year = 2016
mon = 11
source1 = 'TOP500_%d%02d_all.xml' % (year, mon)

def top_data(source):
    """Parse one TOP500 XML list into a DataFrame with one row per entry.

    Parameters
    ----------
    source : str or file object
        File name (or open file) handed to ``ET.parse``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Rank``, ``System-name``, ``Computer``, ``Country``,
        ``State``, ``City``, ``R-max``, ``R-peak``, ``Number-of-processors``
        and ``Power``.  Each row is labelled with the text of the entry's
        second child element (presumably the system id -- confirm against
        the XML schema).

    Notes
    -----
    * ``Rank`` is kept as the raw text read from the file.
    * ``R-max``, ``R-peak`` and ``Number-of-processors`` are converted to
      float and divided by 1e6.
    * An empty ``power`` element is reported as 0.
    * Entries whose country is in the European list below are reported with
      ``Country == 'Europe'``; the original country name replaces ``State``.

    Raises
    ------
    AttributeError
        If an entry lacks one of the expected child elements.
    """
    namespace = {'top500': 'http://www.top500.org/xml/top500/1.0'}
    columns = ['Rank', 'System-name', 'Computer', 'Country', 'State', 'City',
               'R-max', 'R-peak', 'Number-of-processors', 'Power']
    europe = frozenset([
        'Switzerland', 'France', 'Germany', 'Italy', 'Spain',
        'United Kingdom', 'Sweden', 'Netherlands', 'Ireland', 'Belgium',
        'Norway', 'Denmark', 'Finland', 'Austria', 'Czech Republic',
        'Poland'])

    def text_of(site, tag):
        # Text of the namespaced child element; None when it is empty.
        return site.find('top500:' + tag, namespace).text

    # Collect plain records and build the frame once at the end: appending
    # to a DataFrame row by row copies it every time, and DataFrame.append
    # no longer exists in pandas >= 2.0.
    records = []
    labels = []
    for site in ET.parse(source).getroot():
        country = text_of(site, 'country')
        state = text_of(site, 'state')
        if country in europe:
            # Fold European countries into one zone, keeping the country
            # name in the State column.
            state = country
            country = 'Europe'

        power = text_of(site, 'power')
        records.append((
            text_of(site, 'rank'),
            text_of(site, 'system-name'),
            text_of(site, 'computer'),
            country,
            state,
            text_of(site, 'town'),
            float(text_of(site, 'r-max')) / 1e6,
            float(text_of(site, 'r-peak')) / 1e6,
            float(text_of(site, 'number-of-processors')) / 1e6,
            0.0 if power is None else float(power),
        ))
        labels.append(site[1].text)
    return pd.DataFrame(records, columns=columns, index=labels)

# Load the selected edition of the list.
df = top_data(source1)

# Export the first ten rows overall, the first ten rows of the 'Europe' zone
# and the first ten rows whose State is 'France'.
df[:10][['Rank','Country','Computer','System-name','R-peak','Number-of-processors','Power']].to_csv('top10.csv')
df[df['Country']=='Europe'][['Rank','State','City','System-name','R-peak','Number-of-processors','Power']][:10].to_csv('top10_eu.csv')
df[df['State'] == 'France'][:10].to_csv('top10_fr.csv')

# Parenthesised print: same output on Python 2, valid syntax on Python 3.
print(df[:10][['Rank','Country','System-name','R-peak','Power']])

       Rank        Country            System-name      R-peak    Power
178764    1          China      Sunway TaihuLight  125.435904  15371.0
177999    2          China  Tianhe-2 (MilkyWay-2)   54.902400  17808.0
177975    3  United States                  Titan   27.112550   8209.0
177556    4  United States                Sequoia   20.132659   7890.0
178924    5  United States                   Cori   27.880653   3939.0
178932    6          Japan         Oakforest-PACS   24.913459   2718.7
177232    7          Japan                   None   11.280384  12659.9
177824    8         Europe              Piz Daint   15.987968   1312.0
177718    9  United States                   Mira   10.066330   3945.0
178610   10  United States                Trinity   11.078861   4232.6

# First ten rows of the 'Europe' zone (parenthesised print works on Python 2 and 3).
print(df[df['Country']=='Europe'][['Rank','State','System-name','R-peak','Power']][:10])

       Rank           State             System-name     R-peak   Power
177824    8     Switzerland               Piz Daint  15.987968  1312.0
178925   11  United Kingdom                    None   8.128512     0.0
178937   12           Italy  Marconi Intel Xeon Phi  10.832998     0.0
178446   14         Germany               Hazel Hen   7.403520  3615.0
178071   16          France                  Pangea   6.712320  4150.0
177722   19         Germany                 JUQUEEN   5.872026  2301.0
178431   23  United Kingdom                    None   4.249325  1897.0
178749   24  United Kingdom                    None   4.249325  1897.0
178425   29           Italy                    HPC2   4.605000  1227.0
178567   34         Germany                 Mistral   3.962880  1276.0

# First ten rows whose State is 'France' (parenthesised print works on Python 2 and 3).
print(df[df['State']=='France'][['Rank','City','System-name','R-peak','Power']][:10])

       Rank                City       System-name    R-peak   Power
178071   16                 Pau            Pangea  6.712320  4150.0
178429   50            Toulouse           Prolix2  2.534400   830.4
178962   51            Toulouse          Beaufix2  2.585088   830.2
178790   55  Bruyeres-le-Chatel       Tera-1000-1  2.586010  1042.0
178465   64                None           Occigen  2.102630   934.8
178811   73              Angers               Sid  1.676506   543.0
177818   74  Bruyeres-le-Chatel  Curie thin nodes  1.667174  2132.0
178806   75  Bruyeres-le-Chatel            Cobalt  1.479475   539.0
178791   83              Angers             Diego  1.647360   472.0
177728   99                None            Turing  1.258291   493.0

On peut représenter ça de façon synthétique avec des histogrammes:

main_zone=['Europe','China','United States','Japan','Russia','Canada','India','Saudi Arabia']

# Per-country totals of the numeric indicators, sorted in ascending order of
# processor count. Only the numeric columns are summed: text columns such as
# 'Rank' or 'System-name' cannot be meaningfully added up.
Classmt_Country = (
    df.groupby('Country')[['R-max', 'R-peak', 'Number-of-processors', 'Power']]
    .sum()
    .sort_values('Number-of-processors')
)

# One horizontal bar chart per indicator. Every chart shows the last ten rows
# of the table above, i.e. the ten countries with the most processors.
for column, xlabel, prefix in (
        ('Number-of-processors', 'Processors in Million', 'Procs_'),
        ('R-peak', 'R-peak in Peta-Flops', 'Rpeak_'),
        ('R-max', 'R-max in PFlops', 'Rmax_')):
    # Series.plot draws on the current axes, so start a fresh figure to keep
    # the charts (and the saved images) from piling up on one another.
    plt.figure()
    Classmt_Country[column][-10:].plot(kind='barh')
    plt.xlabel(xlabel)
    plt.savefig(prefix+str(year)+'_'+str(mon)+'.png',bbox_inches='tight',dpi=120)

Maintenant on va s'intéresser à l'évolution des classements dans le temps:

def time_data(source):
    """Parse one TOP500 XML list, keeping only the fields used for trends.

    Parameters
    ----------
    source : str or file object
        File name (or open file) handed to ``ET.parse``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Rank``, ``Country``, ``R-max``, ``R-peak`` and
        ``Number-of-processors``.  Each row is labelled with the text of the
        entry's second child element (presumably the system id -- confirm
        against the XML schema).

    Notes
    -----
    * ``Rank`` is kept as the raw text read from the file.
    * The three numeric fields are converted to float and divided by 1e6.
    * Entries whose country is in the European list below are reported with
      ``Country == 'Europe'``.

    Raises
    ------
    AttributeError
        If an entry lacks one of the expected child elements.
    """
    namespace = {'top500': 'http://www.top500.org/xml/top500/1.0'}
    columns = ['Rank', 'Country', 'R-max', 'R-peak', 'Number-of-processors']
    europe = frozenset([
        'Switzerland', 'France', 'Germany', 'Italy', 'Spain',
        'United Kingdom', 'Sweden', 'Netherlands', 'Ireland', 'Belgium',
        'Norway', 'Denmark', 'Finland', 'Austria', 'Czech Republic',
        'Poland'])

    def text_of(site, tag):
        # Text of the namespaced child element; None when it is empty.
        return site.find('top500:' + tag, namespace).text

    # Collect plain records and build the frame once at the end: appending
    # to a DataFrame row by row copies it every time, and DataFrame.append
    # no longer exists in pandas >= 2.0.
    records = []
    labels = []
    for site in ET.parse(source).getroot():
        country = text_of(site, 'country')
        if country in europe:
            country = 'Europe'
        records.append((
            text_of(site, 'rank'),
            country,
            float(text_of(site, 'r-max')) / 1e6,
            float(text_of(site, 'r-peak')) / 1e6,
            float(text_of(site, 'number-of-processors')) / 1e6,
        ))
        labels.append(site[1].text)
    return pd.DataFrame(records, columns=columns, index=labels)


main_zone=['Europe','China','United States','Japan','Russia','Canada']


def _zone_shares(source, kind):
    """Return the percentage of column `kind` held by each zone of `main_zone` in one list."""
    ranking = time_data(source)
    hundredth = ranking[kind].sum() / 100.
    # Aggregate only the requested column: the text columns of the frame
    # cannot be summed and then divided by a number.
    shares = ranking.groupby('Country')[kind].sum() / hundredth
    # reindex keeps the order of main_zone and yields NaN for a zone that
    # has no entry in this list instead of raising.
    return shares.reindex(main_zone)


def feed_time(year,kind='Number-of-processors'):
    """Add the June and November shares of one year to the global ``ts_proc``.

    Parameters
    ----------
    year : int
        Year whose 'TOP500_<year>06_all.xml' and 'TOP500_<year>11_all.xml'
        files are read.
    kind : str
        Column of the frame returned by ``time_data`` to express as a
        percentage of the list total.

    Returns
    -------
    None
        The global ``ts_proc`` is rebound to a frame holding two more rows,
        labelled ``year`` (June list) and ``year + 0.5`` (November list),
        with one column per entry of ``main_zone``.
    """
    global ts_proc
    editions = ((year, '06'), (year + 0.5, '11'))
    # Both files are read before ts_proc is touched, so a failure on either
    # one leaves the table unchanged.
    values = [
        _zone_shares('TOP500_' + str(year) + month + '_all.xml', kind).values
        for _, month in editions
    ]
    new_rows = pd.DataFrame(values,
                            index=[label for label, _ in editions],
                            columns=list(main_zone))
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    # The still-empty table is replaced outright so that its untyped columns
    # do not leak into the result.
    if ts_proc.empty:
        ts_proc = new_rows
    else:
        ts_proc = pd.concat([ts_proc, new_rows])
    
# Tables filled by feed_time, one column per zone of main_zone.
ts_proc= pd.DataFrame(columns=main_zone)
ts_flops= pd.DataFrame(columns=main_zone)
# A real list on both Python 2 and 3, usable directly as tick positions.
time=list(range(2007,2017))

for year in time:
    # Parenthesised print: same output on Python 2, valid syntax on Python 3.
    print(year)
    feed_time(year,kind='R-peak')

ts_proc.plot(figsize=(10,5)).legend(loc='center left', bbox_to_anchor=(1, 0.5))
# Tick labels are passed as a list rather than a one-shot generator.
plt.xticks(time,[str(t) for t in time])
# Raw string: same characters as before, without the invalid '\%' escape.
# NOTE(review): the backslash is only useful if LaTeX text rendering is
# enabled; otherwise it presumably shows up in the label -- confirm intent.
plt.ylabel(r'\% of R-peak Flops')
plt.savefig('Rpeak_history.png',bbox_inches='tight',dpi=120)

from IPython.core.display import HTML

# Read the notebook stylesheet; the context manager closes the file handle
# as soon as its content has been read.
with open('notebooks.css', "r") as css_file:
    style = css_file.read()
HTML(style)

Retour en haut de la page

-- Introduction au Python scientifique --

Exploitation des données de Top500.org

Utilisation de Pandas et etree pour tracer des données xml

simon.marie@lecnam.net

Exploitation des données
de Top500.org