0


python数据分析—— pandas

python数据分析—— pandas

1.模块导入

import pandas as pd
from pandas import Series,DataFrame
import numpy as np

2. Series对象的创建和索引

se = Series([value for value inrange(5,10)],index =[value for value inrange(1,6)])
se.values # 获取内容
se.index  # 获取索引
se.items 
list(se.iteritems())#获取索引+内容的键值对dict={'a':1,'b':3,"c":555}
se = Series(dict)# 通过字典创建Series对象# 获取数据
se['b']# 索引标签获取
se[1]# 位置标签获取

se['a':'c']# 索引标签左右都包含
se[0:2]# 位置标签左包含右不包含# 修改索引值
se.index =list('bnm')# 丢弃指定轴上的数值
se.drop(['b'])# Series进行算术运算操作"""
    两个Series之间的计算是根据索引进行的
    也支持np的函数
"""
series1 = Series(range(1,4),index =list('abc'))
series2 = Series(range(100,103),index =list('cab'))
series1 + series2
series1**2
np.sqrt(series1)

3 DataFrame的创建及相关属性

# DataFrame对象的创建
df1 = DataFrame(np.random.randint(1,16,(3,5)),index =[value for value inrange(1,4)],
                columns =[value for value inrange(1,6)])# 使用数组创建
df2 = DataFrame({'烷烃':[value for value in np.random.randint(1,100,5)],'烯烃':[value for value in np.random.randint(1,100,5)],'芳烃':[value for value in np.random.randint(1,100,5)],})# 通过字典创建

df3 = DataFrame({'烷烃':Series([value for value in np.random.randint(1,100,4)],index =list("abce")),'烯烃':Series([value for value in np.random.randint(1,100,5)],index =list("abcde")),'芳烃':Series([value for value in np.random.randint(1,100,5)],index =list("abcde")),})# 通过字典创建# DataFrame 对象转化为字典
df3.to_dict()# DataFrame对象常用属性
df = DataFrame({'烷烃':Series([value for value in np.random.randint(1,100,4)],index =list("abce")),'烯烃':Series([value for value in np.random.randint(1,100,5)],index =list("abcde")),'芳烃':Series([value for value in np.random.randint(1,100,5)],index =list("abcde")),})
df.shape # 获取行数和列数 以元组形式返回
df.index.tolist()# 获取行索引
df.columns.tolist()# 获取列索引
df.dtypes # 获取数据类型
df.ndim #获取维度
df.values  # 获取数据 返回数组
df.info()# df的概览
df.head(2)# 获取前几行数据 默认为5
df.tail(3)# 获取后几行数据 默认为5# 获取DataFrame的列
df["烷烃"]# 返回数据类型为Series
df[["烷烃","烯烃"]]# 获取DataFrame的行
df["a":"a"]
df[0:1]

df["a":"c"]# 标签索引和位置索引
df.loc[["a","c"],"烷烃":"芳烃"]# 标签索引具体数据
df.iloc[0:2,2]# 位置索引数据# 修改值
df.loc["a","烯烃"]# 排序
df.sort_values(by ="烯烃", ascending =False)

4 DataFrame修改索引、添加数据及删除数据

4.1 DataFrame修改index columns

df = DataFrame({'烷烃':Series([value for value in np.random.randint(1,100,4)],index =list("abce")),'烯烃':Series([value for value in np.random.randint(1,100,5)],index =list("abcde")),'芳烃':Series([value for value in np.random.randint(1,100,5)],index =list("abcde")),})# 直接替换
df.index =list("jsais")# 自定义函数defmap(x):return x+'wj'
df.rename(index =map,columns =map,inplace =False)# inplace = True 在原来基础上修改  inplace = True 不在原来基础上修改
df.rename(index ={"j":"wj"},columns ={"烷烃":"不饱和烃"},inplace =True)# 列转化为索引
df.set_index("不饱和烃",drop ="True")
df.index.name =None

4.2 添加数据

df = DataFrame({'烷烃':Series([value for value in np.random.randint(1,100,4)],index =list("abce")),'烯烃':Series([value for value in np.random.randint(1,100,5)],index =list("abcde")),'芳烃':Series([value for value in np.random.randint(1,100,5)],index =list("abcde")),})# 在最后插入一列
df["密度"]=[value for value in np.random.randint(700,800,5)]# 在指定位置插入一列
df.insert(0,"溴值",[value for value in np.random.randint(20,80,5)])# 替换一行
df.iloc[1]=[value for value in np.random.randint(20,80,5)]# 增加一行
df.append(DataFrame({"溴值":5,"烷烃":10,"烯烃":66,"芳烃":9888,"密度":66},index =["j"]))# 合并
df1 = DataFrame({"语文":Series([i for i in np.random.randint(1,100,20)],index =range(1,21)),"数学":Series([i for i in np.random.randint(1,100,20)],index =range(1,21)),})

df2 = DataFrame({"英语":Series([i for i in np.random.randint(1,100,20)],index =range(22,42)),"化学":Series([i for i in np.random.randint(1,100,20)],index =range(22,42)),})

pd.concat([df1,df2],axis =0)# 按列连接
pd.concat([df1,df2],axis =1)# 按行连接

4.3 删除数据

df = DataFrame({'烷烃':Series([value for value in np.random.randint(1,100,4)],index =list("abce")),'烯烃':Series([value for value in np.random.randint(1,100,5)],index =list("abcde")),'芳烃':Series([value for value in np.random.randint(1,100,5)],index =list("abcde")),})
df.drop(["烷烃"],axis =1, inplace =False)

df.drop(["a"],axis =0, inplace =False)

5 数据处理

import pandas as pd 
from pandas import Series,DataFrame
import numpy as np
from numpy import nan as NaN

se1 = Series([value for value in np.random.randint(1,100,10)],index = np.arange(1,11))
se2 = Series([value for value in np.random.randint(555,10000,10)],index = np.arange(1,11))

df1 = DataFrame({"数学":Series([80+value*10for value in np.random.randn(10)],index = np.arange(1,11)),"语文":Series([75+value*8for value in np.random.randn(10)],index =[ i for i in np.arange(1,16)if(i notin[5,8,9,14,3])]),"英语":Series([75+value*8for value in np.random.randn(10)],index =[ i for i in np.arange(1,16)if(i notin[2,8,4,5,14])]),"理综":Series([75+value*8for value in np.random.randn(10)],index =[ i for i in np.arange(1,16)if(i notin[9,7,3,6,14])])})

df1.append(DataFrame({"数学":NaN,"语文":NaN,"理综":NaN,"英语":NaN,},index =[14]),sort=False)
http://localhost:8888/notebooks/python/data%20miniing/wj_py/ipynb/pandas.ipynb## 过滤缺失数据
df1.dropna()# 默认滤除所有包含nan
df1.dropna(how ="all")# 默认how = "any"  只有一行全是nan才会过滤
df1.dropna(how ="all",axis =1)# 滤除列 默认how = "any"  只有一行全是nan才会过滤
df1.dropna(thresh =2)#保留至少有2个非nan数据的行
 
df1.isnull()
df1.notnull()
df1[df1.notnull()]# 填充缺失数据
df1.fillna(0,inplace =False)# 用常数填充nan
df1.fillna({"数学":60,"英语":70})# 用字典给不同的列填充不同的值
df1.loc[:,"数学"].fillna(50)#只填充某列# 改变填充方式 method
df1.fillna(method ="ffill",axis =0)# 上下填充
df1.fillna(method ="bfill",axis =1)# 左右填充
df1.fillna(method ="ffill",axis =0,limit =1)# 限制填充个数# 移除重复数据
df1 = df1.append(df1.loc[2,:])
df1.duplicated()# 判断某一行是否重复出现  返回bool值
df1.drop_duplicates()# 去除全部的重复行
df1.drop_duplicates(["语文"])# 按指定列去除重复行
df1.drop_duplicates(["语文","英语"],keep ="last")# 保留重复行的最后一行
df1.drop_duplicates(["语文"],inplace =False)# 是否改变原对象

6 数据合并

import pandas as pd
from pandas import Series,DataFrame
import numpy as np

df1 = DataFrame((60+10*np.random.randn(50)).reshape(10,5),index = np.arange(1,11),
                columns =["math","English","chemisry","physics","biology"])
df1.index.name ="编号"
df1.columns.name ="成绩"

df2 = DataFrame((70+8*np.random.randn(18)).reshape(6,3),index = np.arange(8,14),
                columns =["P.E.","history","politics"])# join方法连接  根据行索引来连接
df1.join(df2,how ="left")# 默认左连接
df1.join(df2,how ="right")
df1.join(df2,how ="outer")# 全部连接# merge 方法连接
df1 = DataFrame((60+10*np.random.randn(50)).reshape(10,5),index = np.arange(1,11),
                columns =["math","English","chemisry","physics","biology"])
df1.insert(0,"name",["A","B","C","D","E","G","H","I","J","K"])
df1.index.name ="编号"
df1.columns.name ="成绩"

df2 = DataFrame((70+8*np.random.randn(18)).reshape(6,3),index = np.arange(1,7),
                columns =["P.E.","history","politics"])
df2.insert(0,"name",["A","B","C","M","N","O"])

pd.merge(df1,df2,how ="inner")# 默认为inner,根据左右对象中出现同名的列作为连接的键
pd.merge(df1,df2,on ="name", how ="inner")# 指定列名合并
pd.merge(df1,df2,on ="name", how ="left") 
pd.merge(df1,df2,on ="name", how ="right") 
pd.merge(df1,df2,on ="name", how ="outer")# 所有# 根据多列连接  pd.merge(df1,df2,on = ["name","math"])

7 多层索引

import pandas as pd
from pandas import Series,DataFrame
import numpy as np

# series创建多层索引
se1 = Series([value for value in np.arange(1,7)],
             index =[["math","math","English","English","History","History"],["midterm","end of a term","midterm","end of a term","midterm","end of a term"]])# datafeame创建多层索引
df1 = DataFrame(np.arange(1,19).reshape(6,3),
               index =[["math","math","English","English","History","History"],["midterm","end of a term","midterm","end of a term","midterm","end of a term"]],
               columns =list("abc"))# product构造
class1 =["math","History","English"]
class2 =["midterm","end of a term"]
m_index = pd.MultiIndex.from_product([class1,class2])
df2 = DataFrame(np.arange(1,19).reshape(6,3),
               index = m_index,
               columns =list("abc"))# 多层索引对象的索引 
se1["math"]# 一级索引
se1["math","midterm"]

df1.loc["math"]
df1.loc[["math","midterm"]]

8 时间序列

import pandas as pd
from pandas import Series,DataFrame
import numpy as np

# 生成一段时间范围"""
时间序列频率:D        日历日的每一天
              B        工作日的每一天
              H        每小时
              T/min    每分钟
              S        每秒
              L/ms     毫秒
              U       微秒
              M        日历日的月底日期
              BM       工作日的月底日期
              MS        日历日的月初日期
              BMS       工作日的月初日期
"""  

date = pd.date_range(start ="20190101",end ="20200203")# 以1d为频率生成连续时间序列
date = pd.date_range(start ="2019-02-03",end ="2019-03-05",periods =10)#periods 为时间个数
date = pd.date_range(start ="2020-03-01 08:00:00",end ="2020-04-05 00:00:00",freq  ="10min")# freq为时间间隔
data = pd.date_range(start ="2020-01-08 12:00:00",end ="2020-04-05 00:00:00",
                     periods =20,closed =None)# None包含开始和结束时间、"left"包含开始时间,"right"包含结束时间# **************时间序列在DataFrame中的作用
date = pd.date_range(start ="2020-01-01 08:00:00",end ="2028-12-31 08:00:00",periods =1000)
df = DataFrame(np.arange(5000).reshape((1000,5)),index = date)# 时间序列作为索引
df.truncate(before="2020-08")
df.truncate(after="2020-08")# 过滤前后的数据

df["2020"]# 根据年份获取数据
df["2020-05"]# 获取某月的数据
df["2020-05-01"]# 根据年份和日期获取
df["2020-08-08":"2021-01-01"]# 根据切片获取

df.between_time("09:30","18:00")#返回位于指定时间段的数据集#  ********************** 移位日期
date = pd.date_range(start ="2020-01-01 08:00:00",end ="2020-01-10 08:00:00",periods =10)
se = Series(np.arange(10),index = date)

se.shift(periods =2,freq ="d")# periods 日期偏移量  负向前偏移  freq:单位

se.tshift(10)# 移动指定日期# 将时间戳转化为时间根式
pd.to_datetime(1554970740000,unit ="ms")

pd.to_datetime(1554970740000,unit ="ms").tz_localize('UTC').tz_convert('Asia/Shanghai')# 处理中文
pd.to_datetime("2020年10月23日",format="%Y年%m月%d日")# ******** 日期索引使用实例
data = pd.read_excel("./output.xlsx")
data = data.set_index("time",drop =True)
data.index.name =None
data.index = pd.to_datetime(data.index)# 将每个周一08:00的数据拿出来
result = data[(data.index.weekday_name =="Monday")&(data.index.time == pd.to_datetime("08:00:00").time())]dir(data.index)

9 分组聚合

import numpy as np
import pandas as pd
from pandas import Series,DataFrame

df = DataFrame({"name":Series(np.random.choice(["Jim","Tom","Cyrus","Bob","Smith"],100),index = np.arange(1,101)),"gender":Series(np.random.choice(["male","female"],100),index  = np.arange(1,101)),"salary":Series(np.random.randint(5000,18000,100),index = np.arange(1,101)),"Year":Series(np.random.choice([2015,2016,2017,2018,2019,2020],100),index = np.arange(1,101))})# 根据其中一列分组
group_name = df.groupby("name")# 查看分组情况
group_name.groups
group_name.count()#       查看每一组的具体情况# for name,group in group_name:#     print(name)#     print(group)
    
group_name.get_group("Bob")
df["salary"].groupby(df["name"])# 根据多列进行分组
group_mul = df.groupby(["name","Year"])# for name,group in group_mul:#     print(name,"\n",group)
group_mul.get_group(("Bob",2016))# 将某列数据按按数值分成不同范围进行分组
salary_group = pd.cut(df["salary"],bins =[5000,10000,15000,18000])
sg = df.groupby(salary_group)
sg.count()
pd.crosstab(salary_group,df["Year"])# 聚合"""
    聚合函数:
    mean
    count
    sum
    median
    std
    var
    min
    max
    prod #非nan的积
    first
    last
    mad
    mode
    abs
    
"""# 当分组后进行数值计算时 不是数值的列会被清除
df.groupby("name").sum()
df["salary"].groupby(df["name"]).mean()
df.groupby("name")["salary"].var()# 通过聚合函数
df.groupby("name").agg(["min","max","sum","std"])# 自定义聚合函数defvary(df):returnabs(df.min()-df.max())
df.groupby("name").agg(vary)
df.groupby("Year").agg(["sum","mean",vary])
df.groupby("Year").agg(["sum","mean",("极值",vary)])# 给函数更名str={"salary":["sum","mean"],"Year":vary
} 
df.groupby("name").agg(str)# 每列数据进行不同操作# *********************  apply 函数  ************************"""
apply 函数是pandas中自由度最高的函数
"""# 2016年份全*10defyear(Year):if Year ==2016:return Year*10else:return Year
df["Year"].apply(year)# 取出薪水排名前二的defwj(df,name,n):return df.sort_values(by = name)[-n:]
df.groupby("name").apply(wj,name ="salary",n =2)

10 读取、写入 Excel

import pandas as pd

data = pd.read_excel("./new_data.xlsx",header =1)"""
  指定第一行为列索引  默认为第0行,header可设置为None,此时列索引变为原来的数字索引
"""
data = pd.read_excel("./new_data.xlsx",header =None,sheet_name ="Sheet2")"""
多个表时指定表名
"""
data = pd.read_excel("./new_data.xlsx",header =None,sheet_name ="Sheet3",skiprows =9,usecols ="F:H")"""
拿指定数据
skiprows:跳过哪几行
usecols:使用哪几列 多写一列
"""

data = data.set_index("采样日期")# 写入前把第一列作为行索引
data.to_excel("./wj.xlsx")# 会把行索引写入

11 获取数据库中的数据

import pymysql
import pandas
# 创建一个连接
conn = pymysql.connect(host ="localhost",user ="root",passwd ="*******",db ="demo",port =3309,charset ="utf8")"""
host:本机或远程数据库
port:端口号 一般3309
"""
query ="SELECT id,name FROM num_table"# 方法里面需要填写sql语句
df = pd.read_sql_query(query,conn)

by CyrusMay 2022 04 05


本文转载自: https://blog.csdn.net/Cyrus_May/article/details/123969305
版权归原作者 CyrusMay 所有, 如有侵权,请联系我们删除。

“python数据分析—— pandas”的评论:

还没有评论