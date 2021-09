import pandas as pd

from pathlib import Path



# ---------------------------------------------------------------------------
# Reading a CSV file that has no header row
# ---------------------------------------------------------------------------
filepath = 'test0.csv'
source_file = Path(filepath)
print(source_file.read_text())
# 0.0,1.1,2.2
# 3.3,4.4,5.5
# 6.6,7.7,8.8

# Without any options the first line of the file ends up as the column
# labels, so only two data rows remain.
df = pd.read_csv(filepath)
print(df)
#    0.0  1.1  2.2
# 0  3.3  4.4  5.5
# 1  6.6  7.7  8.8

# Declare that the file has no header row.
no_header_options = {'header': None}
df = pd.read_csv(filepath, **no_header_options)
print(df)
#      0    1    2
# 0  0.0  1.1  2.2
# 1  3.3  4.4  5.5
# 2  6.6  7.7  8.8

# Supply the column names explicitly.
names = [f'col{position}' for position in range(3)]
df = pd.read_csv(filepath, names=names)
print(df)
#    col0  col1  col2
# 0   0.0   1.1   2.2
# 1   3.3   4.4   5.5
# 2   6.6   7.7   8.8



# ---------------------------------------------------------------------------
# Reading a CSV file that has a header row
# ---------------------------------------------------------------------------
filepath = 'test1.csv'
source_file = Path(filepath)
print(source_file.read_text())
# col0,col1,col2
# 0.0,1.1,2.2
# 3.3,4.4,5.5
# 6.6,7.7,8.8

# The column names are inferred from the header row.
df = pd.read_csv(filepath)
print(df)
#    col0  col1  col2
# 0   0.0   1.1   2.2
# 1   3.3   4.4   5.5
# 2   6.6   7.7   8.8

# Give the column names explicitly instead of taking them from the header
# row; header=0 is passed together with names, and the labels found in the
# file do not appear in the result.
names = 'foo bar baz'.split()
df = pd.read_csv(filepath, header=0, names=names)
print(df)
#    foo  bar  baz
# 0  0.0  1.1  2.2
# 1  3.3  4.4  5.5
# 2  6.6  7.7  8.8



# ---------------------------------------------------------------------------
# Choosing the field separator
# ---------------------------------------------------------------------------
filepath = 'test2.csv'
source_file = Path(filepath)
print(source_file.read_text())
# col0 col1 col2      <- fields are separated by a space character
# 0.0 1.1 2.2
# 3.3 4.4 5.5
# 6.6 7.7 8.8

separator = ' '
df = pd.read_csv(filepath, sep=separator)
print(df)
#    col0  col1  col2
# 0   0.0   1.1   2.2
# 1   3.3   4.4   5.5
# 2   6.6   7.7   8.8



# ---------------------------------------------------------------------------
# Choosing which columns to load
# ---------------------------------------------------------------------------
filepath = 'test1.csv'

# Selection by integers.
wanted_positions = [0, 2]
df = pd.read_csv(filepath, usecols=wanted_positions)
print(df)
#    col0  col2
# 0   0.0   2.2
# 1   3.3   5.5
# 2   6.6   8.8

# Selection by column label.
wanted_labels = ['col0', 'col2']
df = pd.read_csv(filepath, usecols=wanted_labels)
print(df)  # output omitted

# Selection by a callable that is asked about each column label.
df = pd.read_csv(filepath, usecols=lambda label: label in wanted_labels)
print(df)  # output omitted



# ---------------------------------------------------------------------------
# Choosing the column that becomes the row labels (index)
# ---------------------------------------------------------------------------
filepath = 'test3.csv'
source_file = Path(filepath)
print(source_file.read_text())
# IDX,col1,col2,col3
# row0,0.0,1.1,2.2
# row1,3.3,4.4,5.5
# row2,6.6,7.7,8.8
# NOTE(review): the original listing showed an empty first header field
# (",col1,col2,col3"), but the output below and index_col='IDX' both need
# that field to be "IDX" -- confirm against the actual test3.csv.

# Pick the index column with an integer.
index_position = 0
df = pd.read_csv(filepath, index_col=index_position)
print(df)
#       col1  col2  col3
# IDX
# row0   0.0   1.1   2.2
# row1   3.3   4.4   5.5
# row2   6.6   7.7   8.8

# Pick the index column by its label.
index_label = 'IDX'
df = pd.read_csv(filepath, index_col=index_label)
print(df)  # output omitted



# ---------------------------------------------------------------------------
# Specifying data types
# ---------------------------------------------------------------------------
filepath = 'test4.csv'
source_file = Path(filepath)
print(source_file.read_text())
# area,tel,value
# tokyo,0312345678,1.0
# kanagawa,045678901,2.0
# chiba,043210987,3.0

# Default type inference: the phone numbers come back as integers, so
# their leading zeros are gone.
df = pd.read_csv(filepath)
print(df)
#        area        tel  value
# 0     tokyo  312345678    1.0
# 1  kanagawa   45678901    2.0
# 2     chiba   43210987    3.0

# Read every column as str.
df = pd.read_csv(filepath, dtype=str)
print(df)
#        area         tel value
# 0     tokyo  0312345678   1.0
# 1  kanagawa   045678901   2.0
# 2     chiba   043210987   3.0

# One dtype per column; the integer keys presumably refer to the column
# positions (area, tel, value) -- verify against the pandas version in use.
per_column_types = {0: str, 1: str, 2: float}
df = pd.read_csv(filepath, dtype=per_column_types)
print(df)  # output omitted



# ---------------------------------------------------------------------------
# Parsing dates
# ---------------------------------------------------------------------------
filepath = 'test5.csv'
source_file = Path(filepath)
print(source_file.read_text())
# date,value0,value1
# 2021/09/07,1.0,2.0
# 2021/09/08,3.0,4.0
# 2021/09/09,5.0,5.0

# parse_dates=True together with index_col: the index is shown as dates.
df = pd.read_csv(filepath, index_col=0, parse_dates=True)
print(df)
#             value0  value1
# date
# 2021-09-07     1.0     2.0
# 2021-09-08     3.0     4.0
# 2021-09-09     5.0     5.0

# List of integers naming the column(s) to parse as dates.
date_positions = [0]
df = pd.read_csv(filepath, parse_dates=date_positions)
print(df)
#         date  value0  value1
# 0 2021-09-07     1.0     2.0
# 1 2021-09-08     3.0     4.0
# 2 2021-09-09     5.0     5.0

# List of column labels to parse as dates.
date_labels = ['date']
df = pd.read_csv(filepath, parse_dates=date_labels)
print(df)  # output omitted



# ---------------------------------------------------------------------------
# Building one datetime column out of separate year / month / day columns
# ---------------------------------------------------------------------------
def _combine_date_columns(frame: pd.DataFrame, groups: dict) -> pd.DataFrame:
    """Return a new frame in which each group of columns is merged into one date.

    Passing a nested list or a dict to ``read_csv(parse_dates=...)`` to merge
    columns is deprecated in recent pandas releases, so the merge is done
    after reading, with ``pd.to_datetime`` applied to the component columns.

    Args:
        frame: Frame as returned by ``pd.read_csv``; it is not modified.
        groups: Mapping of new column name -> list of component column
            labels. The components are handed to ``pd.to_datetime`` as a
            DataFrame, so they must carry names it understands
            (``year``, ``month``, ``day``, ...).

    Returns:
        A frame with the combined datetime columns first (in ``groups``
        order), followed by the remaining columns; the component columns
        are dropped.
    """
    component_labels = [label for labels in groups.values() for label in labels]
    combined = frame.drop(columns=component_labels)
    for position, (new_label, labels) in enumerate(groups.items()):
        combined.insert(position, new_label, pd.to_datetime(frame[labels]))
    return combined


filepath = 'test6.csv'
print(Path(filepath).read_text())
# year,month,day,value0,value1
# 2021,9,7,1.0,2.0
# 2021,9,8,3.0,5.0
# 2021,9,9,4.0,6.0

# Each inner list is one group of columns to merge; the merged column is
# named by joining the component labels with "_".
dates = [['year', 'month', 'day']]
df = _combine_date_columns(
    pd.read_csv(filepath),
    {'_'.join(labels): labels for labels in dates},
)
print(df)
#   year_month_day  value0  value1
# 0     2021-09-07     1.0     2.0
# 1     2021-09-08     3.0     5.0
# 2     2021-09-09     4.0     6.0

# A dict additionally chooses the name of the merged column.
dates = {'date': ['year', 'month', 'day']}
df = _combine_date_columns(pd.read_csv(filepath), dates)
print(df)
#         date  value0  value1
# 0 2021-09-07     1.0     2.0
# 1 2021-09-08     3.0     5.0
# 2 2021-09-09     4.0     6.0



# ---------------------------------------------------------------------------
# Handling missing values
# ---------------------------------------------------------------------------
filepath = 'test7.csv'
source_file = Path(filepath)
print(source_file.read_text())
# col0,col1,col2
# ,nan,1.0
# 2.0,N/A,null
# NaN,3.0,--

# Default behaviour: the string "--" is NOT treated as a missing value.
df = pd.read_csv(filepath)
print(df)
#    col0  col1 col2
# 0   NaN   NaN  1.0
# 1   2.0   NaN  NaN
# 2   NaN   3.0   --

# The default missing-value markers plus the values listed in na_values
# are treated as missing.
extra_na_markers = ['--']
df = pd.read_csv(filepath, na_values=extra_na_markers)
print(df)
#    col0  col1  col2
# 0   NaN   NaN   1.0
# 1   2.0   NaN   NaN
# 2   NaN   3.0   NaN

# With keep_default_na=False only the values listed in na_values are
# treated as missing; everything else is kept as the text found in the file.
df = pd.read_csv(filepath, na_values=extra_na_markers, keep_default_na=False)
print(df)
#   col0 col1  col2
# 0       nan   1.0
# 1  2.0  N/A  null
# 2  NaN  3.0   NaN