#coding = utf-8
import pandas as pd
import numpy as np
import matplotlib as pltdates = pd.date_range('20170601', periods=6)
# make a random 6*4 matrix
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))print df# statistic basics. exclude missing data in general
# mean. mean of cols as default
print df.mean()
'''
A -0.640908
B -0.216183
C 0.316962
D -0.634263
dtype: float64
'''
# mean of rows
print df.mean(1)# move down
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
#print s
'''
2017-06-01 NaN
2017-06-02 NaN
2017-06-03 1.0
2017-06-04 3.0
2017-06-05 5.0
2017-06-06 NaN
Freq: D, dtype: float64
'''# df-s. pandas will make Series into DataFrame
# df will change
print df
print df.sub(s, axis='index')# cumulate by rows. default is by cols
print df.apply(np.cumsum, axis=1)
# apply lambda
print df.apply(lambda x: x.max() - x.min())s = pd.Series(np.random.randint(0, 7, size=10))
# there are duplicate values
# value_counts behaves like histogram
print s.value_counts()# string methods
# s.str.lower() means to lowercaseprint df
# first 3 rows. index by rows in default
print df[:3]# concat. use list as parameter
pieces = [df[:3], df[4:]]
print pd.concat(pieces)# join.
left = pd.DataFrame({'key':['1', '2'], 'lvar':['leftVar1', 'leftVar2']})
right = pd.DataFrame({'key':['1', '2'], 'rvar':['rightVar1', 'rightVar2']})
print left
print right
# merge by same key value
print pd.merge(left, right, on='key')
'''
key lvar rvar
0 1 leftVar1 rightVar1
1 2 leftVar2 rightVar2
'''# append. add a row to the tail
# ignore_index = False, the index will be appended too. If True, then all index will be 0...n (int)
print df.append(df.iloc[3], ignore_index=False)# group
df1 = pd.DataFrame({'A' : ['f', 'b', 'f', 'f', 'b'],
'B' : ['1', '2', '2', '1', '2'],
'C' : np.random.randn(5)})
# use sum()
print df1.groupby(['A', 'B']).sum()# stack unstack means transformation between matrix and DataFrame# pivot_table means group by index and cols, use values. if there's function, execute it
# pivot_table(df, values='D', index=['A', 'B'], columnes=['C'])# time series for time# categoricals# declare as category
s1 = pd.Series(['A', 'B', 'B', 'C', 'A', 'E']).astype("category")
# set category. Must same number of unique levels
s1.cat.categories = ["good", "bad", 'A', 'B']
print s1
# df.sort_values(by="categoryName")
# df.groupby("categoryName").size()# plot
df2 = pd.DataFrame(np.random.randn(1000, 4), columns=['A','B','C','D'])
df2 = df2.cumsum()
# four lines, four colors. with legend.
df2.plot()
#plt.pyplot.show()# file in & out
df2.to_csv("df2.csv")df3 = pd.read_csv("df2.csv")
print df3.head(3)#df2.to_hdf("df2.h5", 'df')
#pd.read_hdf('df2.h5', 'df')# need module openpyxl...
df2.to_excel('df2.xlsx', sheet_name='sheet1')
pd.read_excel('df2.xlsx', 'sheet1', index_col=None, na_values=['NA'])