In [1]:
# Import all libraries needed for the tutorial
import os #only needed to delete the temporary CSV file
import sys #only needed to determine Python version number

import matplotlib #only needed to determine Matplotlib version number
import matplotlib.pyplot as plt
import pandas as pd
from numpy import random

# Enable inline plotting
%matplotlib inline
In [2]:
# Report the interpreter and library versions in use for this run
print('Python version {}'.format(sys.version))
print('Pandas version {}'.format(pd.__version__))
print('Matplotlib version {}'.format(matplotlib.__version__))
In [3]:
# The initial set of baby names
names = ['Bob','Jessica','Mary','John','Mel']
In [4]:
# This will ensure the random samples below can be reproduced.
# This means the random samples will always be identical.
random.seed?
In [5]:
random.randint?
In [6]:
len?
In [7]:
range?
In [8]:
zip?
In [9]:
# Fix the seed so the sampled names are identical on every run
random.seed(500)
# Draw 1000 names: each draw picks a random index into `names`
# (randint's upper bound is exclusive, so the index is always valid)
name_count = len(names)
random_names = [names[random.randint(low=0, high=name_count)] for draw in range(1000)]
# Show the first 10 records
random_names[:10]
Out[9]:
In [10]:
# Simulated number of births per name for the year 1880:
# 1000 independent random integers, each in the range 0..999
births = [random.randint(0, 1000) for draw in range(1000)]
# Show the first 10 values
births[:10]
Out[10]:
In [11]:
# Pair each sampled name with its birth count: one (name, births) tuple per record
BabyDataSet = [record for record in zip(random_names, births)]
# Show the first 10 records
BabyDataSet[:10]
Out[11]:
In [12]:
# Load the (name, births) records into a two-column DataFrame
df = pd.DataFrame(BabyDataSet, columns=['Names', 'Births'])
# Show the first 10 rows
df.head(10)
Out[12]:
In [13]:
df.to_csv?
In [14]:
# Write the DataFrame to a text file in CSV format, leaving out both the
# index column (index=False) and the column-name header row (header=False)
df.to_csv('births1880.txt',index=False,header=False)
In [15]:
pd.read_csv?
In [16]:
# Path of the CSV file to read back. It is relative to the current working
# directory, which is where df.to_csv('births1880.txt', ...) wrote the file,
# so the notebook runs on any machine rather than one user's Windows folder.
Location = 'births1880.txt'
# Read with default settings. The file was written without a header row, so
# this call consumes the first record as the column names.
df = pd.read_csv(Location)
In [17]:
df.info()
In [18]:
df.head()
Out[18]:
In [19]:
# header=None tells read_csv the file has no header row, so the first record
# is kept as data and the columns receive default integer labels
df = pd.read_csv(Location, header=None)
# Summarize the frame (row count, column dtypes) to confirm all records loaded
df.info()
In [20]:
df.tail()
Out[20]:
In [21]:
# Supply explicit column names, since the file itself carries no header row
df = pd.read_csv(Location, names=['Names', 'Births'])
# Show the first five rows (head's default)
df.head()
Out[21]:
In [22]:
# Delete the temporary CSV now that its contents are loaded into df.
# The existence check lets this cell be re-run without raising
# FileNotFoundError once the file is already gone.
if os.path.exists(Location):
    os.remove(Location)
In [23]:
# Method 1:
df['Names'].unique()
Out[23]:
In [24]:
# If you actually want to print the unique values:
# Print each distinct name on its own line
for x in df['Names'].unique():
    print(x)
In [25]:
# Method 2:
print(df['Names'].describe())
In [26]:
df.groupby?
In [27]:
# Build a groupby object keyed on the 'Names' column
name = df.groupby(by='Names')
# Total the remaining columns within each name group.
# Note: the result replaces df, which is now indexed by name.
df = name.sum()
# Display the aggregated frame
df
Out[27]:
In [28]:
# Method 1:
Sorted = df.sort_values(['Births'], ascending=False)
Sorted.head(1)
Out[28]:
In [29]:
# Method 2:
df['Births'].max()
Out[29]:
In [30]:
# Draw a bar chart of total births, one bar per name
df['Births'].plot(kind='bar')
print("The most popular name")
# Display the names ordered from most to fewest births
df.sort_values('Births', ascending=False)
Out[30]:
No comments:
Post a Comment