快乐学习
前程无忧、中华英才非你莫属!

Pandas进阶



###############第1课: 从CSV读取 - 导出到CSV - 查找最大值 - 绘制数据##############


from pandas import DataFrame, read_csv

import matplotlib.pyplot as plt
import pandas as pd
import sys
import matplotlib
import numpy as np


print('Python version ' + sys.version)
print('Pandas version ' + pd.__version__)
print('Matplotlib version ' + matplotlib.__version__)

# 创建数据
# 数据集将包括5个宝宝的名字和当年记录的出生人数(1880年)。
# 初始宝宝的名字和出生率
names = ['Bob','Jessica','Mary','John','Mel']
births = [968, 155, 77, 578, 973]

# 为了将这两个列表合并在一起,我们将使用zip函数。

BabyDataSet = list(zip(names,births))

BabyDataSet

df = pd.DataFrame(data = BabyDataSet, columns=['Names', 'Births'])
df

df.to_csv('births1880.csv',index=False,header=False)
Location = r'E:PythonProjectbirths1880.csv'


df = pd.read_csv(Location)
df

df = pd.read_csv(Location, names=['Names','Births'])
df

import os
os.remove(Location)


df.dtypes
df.Births.dtype


Sorted = df.sort_values(['Births'], ascending=False)
Sorted.head(1)

df['Births'].max()



# 解释这些部分:
# df ['Names'] - 这是婴儿名字的整个列表,整个名字列
# df ['Births'] - 这是1880年出生的整个列表,整个Births列
# df ['Births'].max() - 这是出生列中找到的最大值
#
# [df ['Births'] == df ['Births'].max()] IS EQUAL TO [查找出生列中等于973的所有记录]
# df ['Names'] [df [Births'] == df ['Births']。max()] 是等于选择Names列中的所有记录WHERE [Births列等于973]


# Create graph
df['Births'].plot()
# Maximum value in the data set
MaxValue = df['Births'].max()
# Name associated with the maximum value
MaxName = df['Names'][df['Births'] == df['Births'].max()].values
# Text to display on graph
Text = str(MaxValue) + " - " + MaxName
# Add text to graph
plt.annotate(Text, xy=(1, MaxValue), xytext=(8, 0), xycoords=('axes fraction', 'data'), textcoords='offset points')
print("The most popular name")
df[df['Births'] == df['Births'].max()]
#Sorted.head(1) can also be used


###############2课:从TXT读取 - 导出到TXT - 选择顶部/底部记录 - 描述性统计 - 分组/排序数据##############



# 1、准备数据
names = ['Bob','Jessica','Mary','John','Mel']
np.random.seed(500)
random_names = [names[np.random.randint(low=0,high=len(names))] for i in range(1000)]

random_names[:10]

# The number of births per name for the year 1880
births = [np.random.randint(low=0,high=1000) for i in range(1000)]
births[:10]

BabyDataSet = list(zip(random_names,births))
BabyDataSet[:10]

df = pd.DataFrame(data = BabyDataSet, columns=['Names', 'Births'])
df[:10]

df.to_csv('births1880.txt',index=False,header=False)

Location = r'E:PythonProjectbirths1880.txt'
df = pd.read_csv(Location)
df.info()
df.head()

df = pd.read_csv(Location, header=None)
df.info()
df.tail()
df = pd.read_csv(Location, names=['Names','Births'])
df.head(5)
import os
os.remove(Location)



# 唯一属性来查找“名称”列的所有唯一记录
# Method 1:
df['Names'].unique()


# If you actually want to print the unique values:
for x in df['Names'].unique():
    print(x)


# Method 2:
print(df['Names'].describe())

# Create a groupby object

# 由于每个宝贝名称都有多个值,因此我们需要汇总这些数据
name = df.groupby('Names')

# Apply the sum function to the groupby object
df = name.sum()
df


# Method 1:
Sorted = df.sort_values(['Births'], ascending=False)
Sorted.head(1)
# Method 2:
df['Births'].max()


# Create graph
df['Births'].plot.bar()

print("The most popular name")
df.sort_values(by='Births', ascending=False)


##############第三课、创建功能 - 从EXCEL读取 - 导出到EXCEL - 离群值 - Lambda函数 - 切片和切块数据


# set seed
np.random.seed(111)


# Function to generate test data
def CreateDataSet(Number=1):
    Output = []

    for i in range(Number):
        # Create a weekly (mondays) date range
        rng = pd.date_range(start='1/1/2009', end='12/31/2012', freq='W-MON')

        # Create random data
        data = np.random.randint(low=25, high=1000, size=len(rng))

        # Status pool
        status = [1, 2, 3]

        # Make a random list of statuses
        random_status = [status[np.random.randint(low=0, high=len(status))] for i in range(len(rng))]

        # State pool
        states = ['GA', 'FL', 'fl', 'NY', 'NJ', 'TX']

        # Make a random list of states
        random_states = [states[np.random.randint(low=0, high=len(states))] for i in range(len(rng))]

        Output.extend(zip(random_states, random_status, data, rng))

    return Output

dataset = CreateDataSet(4)
df = pd.DataFrame(data=dataset, columns=['State','Status','CustomerCount','StatusDate'])
df.info()
df.head()


df.to_excel('Lesson3.xlsx', index=False)
print('Done')
# Location of file
Location = r'E:PythonProjectLesson3.xlsx'

# Parse a specific sheet
df = pd.read_excel(Location, 0, index_col='StatusDate')
df.dtypes

# Parse a specific sheet
df = pd.read_excel(Location, 0, index_col='StatusDate')
df.dtypes
df.index

df.head()
# 保留数组中不同的值
df['State'].unique()
# 要将所有状态值转换为大写
df['State'] = df.State.apply(lambda x: x.upper())
df['State'].unique()

# Only grab where Status == 1
mask = df['Status'] == 1
df = df[mask]
# Convert NJ to NY
mask = df.State == 'NJ'
df['State'][mask] = 'NY'


df['State'].unique()

df['CustomerCount'].plot(figsize=(15,5));


sortdf = df[df['State']=='NY'].sort_index(axis=0)
sortdf.head(10)

# Group by State and StatusDate
Daily = df.reset_index().groupby(['State','StatusDate']).sum()
Daily.head()

Daily.index

# Select the State index
Daily.index.levels[0]

Daily.index.levels[1]


Daily.loc['FL'].plot()
Daily.loc['GA'].plot()
Daily.loc['NY'].plot()
Daily.loc['TX'].plot()


Daily.loc['FL']['2012':].plot()
Daily.loc['GA']['2012':].plot()
Daily.loc['NY']['2012':].plot()
Daily.loc['TX']['2012':].plot()



StateYearMonth = Daily.groupby([Daily.index.get_level_values(0), Daily.index.get_level_values(1).year, Daily.index.get_level_values(1).month])
Daily['Lower'] = StateYearMonth['CustomerCount'].transform( lambda x: x.quantile(q=.25) - (1.5*x.quantile(q=.75)-x.quantile(q=.25)) )
Daily['Upper'] = StateYearMonth['CustomerCount'].transform( lambda x: x.quantile(q=.75) + (1.5*x.quantile(q=.75)-x.quantile(q=.25)) )
Daily['Outlier'] = (Daily['CustomerCount'] < Daily['Lower']) | (Daily['CustomerCount'] > Daily['Upper'])

# Remove Outliers
Daily = Daily[Daily['Outlier'] == False]
Daily.head()



# Combine all markets

# Get the max customer count by Date
ALL = pd.DataFrame(Daily['CustomerCount'].groupby(Daily.index.get_level_values(1)).sum())
ALL.columns = ['CustomerCount'] # rename column

# Group by Year and Month
YearMonth = ALL.groupby([lambda x: x.year, lambda x: x.month])

# What is the max customer count per Year and Month
ALL['Max'] = YearMonth['CustomerCount'].transform(lambda x: x.max())
ALL.head()


# Create the BHAG dataframe
data = [1000,2000,3000]
idx = pd.date_range(start='12/31/2011', end='12/31/2013', freq='A')
BHAG = pd.DataFrame(data, index=idx, columns=['BHAG'])
BHAG

# Combine the BHAG and the ALL data set
combined = pd.concat([ALL,BHAG], axis=0)
combined = combined.sort_index(axis=0)
combined.tail()

fig, axes = plt.subplots(figsize=(12, 7))

combined['BHAG'].fillna(method='pad').plot(color='green', label='BHAG')
combined['Max'].plot(color='blue', label='All Markets')
plt.legend(loc='best')


# Group by Year and then get the max value per year
Year = combined.groupby(lambda x: x.year).max()
Year


# Add a column representing the percent change per year
Year['YR_PCT_Change'] = Year['Max'].pct_change(periods=1)
Year

(1 + Year.ix[2012,'YR_PCT_Change']) * Year.loc[2012,'Max']




# First Graph
ALL['Max'].plot(figsize=(10, 5));plt.title('ALL Markets')

# Last four Graphs
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20, 10))
fig.subplots_adjust(hspace=1.0) ## Create space between plots

Daily.loc['FL']['CustomerCount']['2012':].fillna(method='pad').plot(ax=axes[0,0])
Daily.loc['GA']['CustomerCount']['2012':].fillna(method='pad').plot(ax=axes[0,1])
Daily.loc['TX']['CustomerCount']['2012':].fillna(method='pad').plot(ax=axes[1,0])
Daily.loc['NY']['CustomerCount']['2012':].fillna(method='pad').plot(ax=axes[1,1])

# Add titles
axes[0,0].set_title('Florida')
axes[0,1].set_title('Georgia')
axes[1,0].set_title('Texas')
axes[1,1].set_title('North East')



############第四课: - 添加/删除列 - 索引操作###############


# Our small data set
d = [0,1,2,3,4,5,6,7,8,9]

# Create dataframe
df = pd.DataFrame(d)
df

# Lets change the name of the column
df.columns = ['Rev']
df

# Lets add a column
df['NewCol'] = 5
df

# Lets modify our new column
df['NewCol'] = df['NewCol'] + 1
df


del df['NewCol']
df

# Lets add a couple of columns
df['test'] = 3
df['col'] = df['Rev']
df


# If we wanted, we could change the name of the index
i = ['a','b','c','d','e','f','g','h','i','j']
df.index = i
df


df.loc['a']


# df.loc[inclusive:inclusive]
df.loc['a':'d']


# df.iloc[inclusive:exclusive]
# Note: .iloc is strictly integer position based. It is available from [version 0.11.0] (http://pandas.pydata.org/pandas-docs/stable/whatsnew.html#v0-11-0-april-22-2013)
df.iloc[0:3]


df['Rev']

df[['Rev', 'test']]

# df.ix[rows,columns]
# replaces the deprecated ix function
#df.ix[0:3,'Rev']
df.loc[df.index[0:3],'Rev']



# replaces the deprecated ix function
#df.ix[5:,'col']
df.loc[df.index[5:],'col']


# replaces the deprecated ix function
#df.ix[:3,['col', 'test']]
df.loc[df.index[:3],['col', 'test']]


# Select top N number of records (default = 5)
df.head()


# Select bottom N number of records (default = 5)
df.tail()


##########第五课、堆栈/取消堆栈/移调功能########



# Our small data set
d = {'one':[1,1],'two':[2,2]}
i = ['a','b']

# Create dataframe
df = pd.DataFrame(data = d, index = i)
df
df.index
stack = df.stack()
stack
stack.index

unstack = df.unstack()
unstack
unstack.index

transpose = df.T
transpose
transpose.index


#################第六课,gruop by###################


# Our small data set
d = {'one':[1,1,1,1,1],
     'two':[2,2,2,2,2],
     'letter':['a','a','b','b','c']}

# Create dataframe
df = pd.DataFrame(d)
df
# Create group object
one = df.groupby('letter')

# Apply sum function
one.sum()
letterone = df.groupby(['letter','one']).sum()
letterone
letterone.index
letterone = df.groupby(['letter','one'], as_index=False).sum()
letterone
letterone.index


#################第7课,计算离群值的方法###################



# Create a dataframe with dates as your index
States = ['NY', 'NY', 'NY', 'NY', 'FL', 'FL', 'GA', 'GA', 'FL', 'FL']
data = [1.0, 2, 3, 4, 5, 6, 7, 8, 9, 10]
idx = pd.date_range('1/1/2012', periods=10, freq='MS')
df1 = pd.DataFrame(data, index=idx, columns=['Revenue'])
df1['State'] = States

# Create a second dataframe
data2 = [10.0, 10.0, 9, 9, 8, 8, 7, 7, 6, 6]
idx2 = pd.date_range('1/1/2013', periods=10, freq='MS')
df2 = pd.DataFrame(data2, index=idx2, columns=['Revenue'])
df2['State'] = States
# Combine dataframes
df = pd.concat([df1,df2])
df

# Method 1

# make a copy of original df
newdf = df.copy()

newdf['x-Mean'] = abs(newdf['Revenue'] - newdf['Revenue'].mean())
newdf['1.96*std'] = 1.96*newdf['Revenue'].std()
newdf['Outlier'] = abs(newdf['Revenue'] - newdf['Revenue'].mean()) > 1.96*newdf['Revenue'].std()
newdf


# Method 2
# Group by item

# make a copy of original df
newdf = df.copy()

State = newdf.groupby('State')

newdf['Outlier'] = State.transform( lambda x: abs(x-x.mean()) > 1.96*x.std() )
newdf['x-Mean'] = State.transform( lambda x: abs(x-x.mean()) )
newdf['1.96*std'] = State.transform( lambda x: 1.96*x.std() )
newdf

# Method 2
# Group by multiple items

# make a copy of original df
newdf = df.copy()

StateMonth = newdf.groupby(['State', lambda x: x.month])

newdf['Outlier'] = StateMonth.transform( lambda x: abs(x-x.mean()) > 1.96*x.std() )
newdf['x-Mean'] = StateMonth.transform( lambda x: abs(x-x.mean()) )
newdf['1.96*std'] = StateMonth.transform( lambda x: 1.96*x.std() )
newdf

# Method 3
# Group by item

# make a copy of original df
newdf = df.copy()

State = newdf.groupby('State')

def s(group):
    group['x-Mean'] = abs(group['Revenue'] - group['Revenue'].mean())
    group['1.96*std'] = 1.96*group['Revenue'].std()
    group['Outlier'] = abs(group['Revenue'] - group['Revenue'].mean()) > 1.96*group['Revenue'].std()
    return group

Newdf2 = State.apply(s)
Newdf2

# make a copy of original df
newdf = df.copy()

State = newdf.groupby('State')

newdf['Lower'] = State['Revenue'].transform( lambda x: x.quantile(q=.25) - (1.5*(x.quantile(q=.75)-x.quantile(q=.25))) )
newdf['Upper'] = State['Revenue'].transform( lambda x: x.quantile(q=.75) + (1.5*(x.quantile(q=.75)-x.quantile(q=.25))) )
newdf['Outlier'] = (newdf['Revenue'] < newdf['Lower']) | (newdf['Revenue'] > newdf['Upper'])
newdf


################第八课: 从MySQL数据库读取###################3

from sqlalchemy import create_engine, MetaData, Table, select, engine


# Parameters
TableName = "employees"
# Create the connection
engine=create_engine("mysql+pymysql://root:root@localhost:3306/test")
conn = engine.connect()

# Required for querying tables
metadata = MetaData(conn)

# Table to query
tbl = Table(TableName, metadata, autoload=True)
#tbl.create(checkfirst=True)

# Select all
sql = tbl.select()

# run sql code
result = conn.execute(sql)

result

# Insert to a dataframe
df = pd.DataFrame(data=list(result), columns=result.keys())
df
# Close connection
conn.close()

print('Done')


# 第九课: - 导出为CSV / EXCEL / TXT

df.to_csv('DimDate.csv', index=False)
print('Done')

df.to_excel('DimDate.xls', index=False)
print('Done')

df.to_csv('DimDate.txt', index=False)
print('Done')


# 第十课: - 在不同格式之间转换


# Create DataFrame
d = [1,2,3,4,5,6,7,8,9]
df = pd.DataFrame(d, columns = ['Number'])
df

# Export to Excel
df.to_excel('Lesson10.xlsx', sheet_name = 'testing', index = False)
print('Done')
# Path to excel file
# Your path will be different, please modify the path below.
location = r'E:PythonProjectLesson10.xlsx'

# Parse the excel file
df = pd.read_excel(location, 0)
df.head()

df.dtypes

df.tail()

df.to_json('Lesson10.json')
print('Done')

# Your path will be different, please modify the path below.
jsonloc = r'E:PythonProjectLesson10.json'

# read json file
df2 = pd.read_json(jsonloc)

df2.dtypes


# 第十一课: - 结合各种来源的数据

# Create DataFrame
d = {'Channel':[1], 'Number':[255]}
df = pd.DataFrame(d)
df

# Export to Excel

df.to_excel('test1.xlsx', sheet_name = 'test1', index = False)
df.to_excel('test2.xlsx', sheet_name = 'test2', index = False)
df.to_excel('test3.xlsx', sheet_name = 'test3', index = False)
print('Done')

# List to hold file names
FileNames = []

# Your path will be different, please modify the path below.
os.chdir(r"E:PythonProject")



# Find any file that ends with ".xlsx"
for files in os.listdir("."):
    if files.endswith(".xlsx"):
        FileNames.append(files)

FileNames


def GetFile(fnombre):
    # Path to excel file
    # Your path will be different, please modify the path below.
    location = r'E:PythonProject\' + fnombre

    # Parse the excel file
    # 0 = first sheet
    df = pd.read_excel(location, 0)

    # Tag record to file name
    df['File'] = fnombre

    # Make the "File" column the index of the df
    return df.set_index(['File'])

# Create a list of dataframes
df_list = [GetFile(fname) for fname in FileNames]
df_list


# Combine all of the dataframes into one
big_df = pd.concat(df_list)
big_df

big_df.dtypes

# Plot it!
big_df['Channel'].plot.bar()


# Python实用的数据分析

# 1、数据传输
import numpy as np
import pandas as pd


ver=pd.read_csv("ver.csv")

pd.set_option('display.max_columns', 80)
ver.head(3)
ver.shape
len(ver)
ver.columns
ver['action_taken'][:5]
incomeranges = pd.cut(ver['applicant_income_000s'], 14)
incomeranges[:5]
pd.value_counts(incomeranges)
ver.ix[0,0:6]
ver['loan_amount_000s'].order()[:5]

sorteddata = ver.sort(['loan_amount_000s'])
sorteddata.ix[:,0:6].head(3)

sorteddata.iloc[0:3,0:3]
ver['action_taken_name'].value_counts()

zip(ver.columns, [type(x) for x in ver.ix[0,:]])
ver.dtypes
ver['county_name'].unique()
len(ver['county_name'].unique())
ver.ix[0:3,'preapproval_name']
ver.ix[0:3,'preapproval_name'] == "Preapproval was requested"


# 2、汇总数据

ver=pd.read_csv("ver.csv")
melt = pd.melt(ver, id_vars = 'loan_purpose_name')
melt.iloc[0:5,:]
melt = pd.melt(ver, id_vars = 'county_name')
melt.iloc[0:5,:]
ver.describe()
pd.crosstab(ver['county_name'],ver['action_taken_name'])
incomesubset = ver[(ver['applicant_income_000s'] > 0 ) & (ver['applicant_income_000s'] < 1000)]
incomesubset

incomesubset.shape

qry1 = ver.query('(applicant_income_000s > 0) & (applicant_income_000s < 1000)')
qry1.head(10)

qry1.shape

grouped1 = ver.groupby(['applicant_race_name_1','loan_purpose_name']).mean()
grouped1


grouped1["action_taken_name"].unique()

打赏

未经允许不得转载:同乐学堂 » Pandas进阶

分享到:更多 ()

评论 抢沙发

  • 昵称 (必填)
  • 邮箱 (必填)
  • 网址

特别的技术,给特别的你!

联系QQ:1071235258QQ群:226134712
error: Sorry,暂时内容不可复制!