import xml.etree.ElementTree as ET

import matplotlib
import matplotlib.pyplot as plt
import pandas as pd

matplotlib.style.use('seaborn-poster')
matplotlib.style.use('fivethirtyeight')
%matplotlib inline
On s'intéresse d'abord à la dernière liste publiée sur top500.org
# Edition of the TOP500 list to analyse (year and month of publication).
year = 2016
mon = 11
# File names follow the TOP500_<YYYY><MM>_all.xml pattern, so the month is
# zero-padded to two digits (e.g. 6 -> "06").
source1 = 'TOP500_%d%02d_all.xml' % (year, mon)
def top_data(source):
    """Load a TOP500 XML list into a DataFrame, one row per listed system.

    Args:
        source: path (or file object) of a TOP500 XML export whose elements
            live in the ``http://www.top500.org/xml/top500/1.0`` namespace.

    Returns:
        A pandas DataFrame with the columns Rank, System-name, Computer,
        Country, State, City, R-max, R-peak, Number-of-processors and Power.
        Rows keep the order of the file and are labelled with the text of
        the second child element of each entry.

        * Countries in the European list below are reported as
          Country='Europe', and the original country name is moved to State.
        * R-max, R-peak and Number-of-processors are divided by 1e6.
        * A missing or empty power value is stored as 0.0.
        * Rank and the other text fields are kept as strings (None when the
          element is absent or empty).
    """
    namespace = {'top500': 'http://www.top500.org/xml/top500/1.0'}
    columns = ['Rank', 'System-name', 'Computer', 'Country', 'State', 'City',
               'R-max', 'R-peak', 'Number-of-processors', 'Power']
    europe = frozenset([
        'Switzerland', 'France', 'Germany', 'Italy', 'Spain',
        'United Kingdom', 'Sweden', 'Netherlands', 'Ireland', 'Belgium',
        'Norway', 'Denmark', 'Finland', 'Austria', 'Czech Republic',
        'Poland'])

    def text_of(site, tag, default=None):
        """Return the text of child `tag`, or `default` if absent/empty."""
        node = site.find('top500:' + tag, namespace)
        if node is None or node.text is None:
            return default
        return node.text

    rows = []
    labels = []
    for site in ET.parse(source).getroot():
        country = text_of(site, 'country')
        state = text_of(site, 'state')
        if country in europe:
            # Aggregate European countries, keeping the country as "State".
            state, country = country, 'Europe'
        rows.append({
            'Rank': text_of(site, 'rank'),
            'System-name': text_of(site, 'system-name'),
            'Computer': text_of(site, 'computer'),
            'Country': country,
            'State': state,
            'City': text_of(site, 'town'),
            'R-max': float(text_of(site, 'r-max')) / 1e6,
            'R-peak': float(text_of(site, 'r-peak')) / 1e6,
            'Number-of-processors':
                float(text_of(site, 'number-of-processors')) / 1e6,
            'Power': float(text_of(site, 'power', '0')),
        })
        labels.append(site[1].text)

    # Build the frame once: appending row by row is quadratic and
    # DataFrame.append no longer exists in recent pandas releases.
    return pd.DataFrame(rows, index=labels, columns=columns)
# Load the selected list, then export and print three "top 10" views:
# overall, Europe only, and France only (France is stored in 'State'
# because top_data() reports European countries as Country='Europe').
df = top_data(source1)
europe_rows = df[df['Country'] == 'Europe']
france_rows = df[df['State'] == 'France']

df.head(10)[['Rank', 'Country', 'Computer', 'System-name', 'R-peak',
             'Number-of-processors', 'Power']].to_csv('top10.csv')
europe_rows[['Rank', 'State', 'City', 'System-name', 'R-peak',
             'Number-of-processors', 'Power']].head(10).to_csv('top10_eu.csv')
france_rows.head(10).to_csv('top10_fr.csv')

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(df.head(10)[['Rank', 'Country', 'System-name', 'R-peak', 'Power']])
print(europe_rows[['Rank', 'State', 'System-name', 'R-peak', 'Power']].head(10))
print(france_rows[['Rank', 'City', 'System-name', 'R-peak', 'Power']].head(10))
On peut représenter ça de façon synthétique avec des histogrammes:
main_zone = ['Europe', 'China', 'United States', 'Japan', 'Russia', 'Canada',
             'India', 'Saudi Arabia']

# Per-country totals, sorted so that the largest processor counts come last.
# Only the numeric columns are summed; text columns such as Rank or
# System-name have no meaningful total.
Classmt_Country = (
    df.groupby('Country')[['R-max', 'R-peak', 'Number-of-processors', 'Power']]
    .sum()
    .sort_values('Number-of-processors')
)

# One horizontal bar chart per metric, each saved to its own PNG.
for column, xlabel, prefix in (
        ('Number-of-processors', 'Processors in Million', 'Procs_'),
        ('R-peak', 'R-peak in Peta-Flops', 'Rpeak_'),
        ('R-max', 'R-max in PFlops', 'Rmax_')):
    # Series.plot draws on the current axes, so start a fresh figure to
    # keep the three charts from being drawn on top of each other.
    plt.figure()
    Classmt_Country[column].tail(10).plot(kind='barh')
    plt.xlabel(xlabel)
    plt.savefig(prefix + str(year) + '_' + str(mon) + '.png',
                bbox_inches='tight', dpi=120)
Maintenant on va s'intéresser à l'évolution des classements dans le temps:
def time_data(source):
    """Load the columns of a TOP500 XML list needed for the history plots.

    Args:
        source: path (or file object) of a TOP500 XML export whose elements
            live in the ``http://www.top500.org/xml/top500/1.0`` namespace.

    Returns:
        A pandas DataFrame with the columns Rank, Country, R-max, R-peak and
        Number-of-processors, one row per entry in file order, labelled with
        the text of the second child element of each entry.  Countries in
        the European list below are reported as 'Europe'; the three numeric
        columns are divided by 1e6.  Rank is kept as a string.
    """
    namespace = {'top500': 'http://www.top500.org/xml/top500/1.0'}
    columns = ['Rank', 'Country', 'R-max', 'R-peak', 'Number-of-processors']
    europe = frozenset([
        'Switzerland', 'France', 'Germany', 'Italy', 'Spain',
        'United Kingdom', 'Sweden', 'Netherlands', 'Ireland', 'Belgium',
        'Norway', 'Denmark', 'Finland', 'Austria', 'Czech Republic',
        'Poland'])

    def text_of(site, tag):
        """Return the text of the namespaced child element `tag`."""
        return site.find('top500:' + tag, namespace).text

    rows = []
    labels = []
    for site in ET.parse(source).getroot():
        country = text_of(site, 'country')
        if country in europe:
            country = 'Europe'
        rows.append({
            'Rank': text_of(site, 'rank'),
            'Country': country,
            'R-max': float(text_of(site, 'r-max')) / 1e6,
            'R-peak': float(text_of(site, 'r-peak')) / 1e6,
            'Number-of-processors':
                float(text_of(site, 'number-of-processors')) / 1e6,
        })
        labels.append(site[1].text)

    # Build the frame once: appending row by row is quadratic and
    # DataFrame.append no longer exists in recent pandas releases.
    return pd.DataFrame(rows, index=labels, columns=columns)
# Zones tracked in the historical share time series.
main_zone = ['Europe', 'China', 'United States', 'Japan', 'Russia', 'Canada']


def feed_time(year, kind='Number-of-processors'):
    """Add the June and November shares of `year` to the global `ts_proc`.

    For each of the two lists published in `year` (``TOP500_<year>06_all.xml``
    and ``TOP500_<year>11_all.xml``) the percentage of the list total of
    column `kind` held by every zone of `main_zone` is computed, and the two
    resulting rows are added to the module-level DataFrame `ts_proc`.

    Args:
        year: publication year of the lists to load.
        kind: name of the time_data() column to aggregate.

    Returns:
        None.  The module-level `ts_proc` is rebound to the extended frame;
        the new rows are labelled `year` (June) and `year + 0.5` (November).
        A zone with no system in a list gets NaN for that list.
    """
    global ts_proc

    def zone_shares(source):
        """Return the percentage of the `kind` total held by each zone."""
        ranking = time_data(source)
        # Aggregate only the requested column: other columns (e.g. Rank,
        # which is text) cannot be meaningfully summed or divided.
        per_country = ranking.groupby('Country')[kind].sum()
        percent = per_country / (ranking[kind].sum() / 100.)
        # reindex() yields NaN for an absent zone where label-based
        # selection would raise KeyError.
        return percent.reindex(main_zone)

    june = zone_shares('TOP500_' + str(year) + '06_all.xml')
    november = zone_shares('TOP500_' + str(year) + '11_all.xml')
    new_rows = pd.DataFrame([june.values, november.values],
                            index=[float(year), year + 0.5],
                            columns=main_zone,
                            dtype=float)
    # Skip the concat while ts_proc has no rows so that the numeric dtypes
    # of the index and columns are preserved for plotting.
    ts_proc = new_rows if ts_proc.empty else pd.concat([ts_proc, new_rows])
# Accumulator filled by feed_time(); ts_flops is created alongside it but is
# not filled in this section.
ts_proc = pd.DataFrame(columns=main_zone)
ts_flops = pd.DataFrame(columns=main_zone)

# Years whose June and November lists are loaded.
time = list(range(2007, 2017))
# A dedicated loop variable keeps the module-level `year` untouched.
for list_year in time:
    print(list_year)
    feed_time(list_year, kind='R-peak')

ts_proc.plot(figsize=(10, 5)).legend(loc='center left',
                                     bbox_to_anchor=(1, 0.5))
# Tick labels are passed as a list rather than a one-shot generator.
plt.xticks(time, [str(t) for t in time])
# Raw string: same characters as before, without the invalid "\%" escape.
plt.ylabel(r'\% of R-peak Flops')
plt.savefig('Rpeak_history.png', bbox_inches='tight', dpi=120)
from IPython.core.display import HTML

# Read the notebook stylesheet; the context manager closes the file handle
# instead of leaving it to the garbage collector.
with open('notebooks.css', "r") as css_file:
    style = css_file.read()
# Last expression of the cell: the notebook renders the returned HTML object.
HTML(style)