# `pd` is the conventional alias for Pandas, as `np` is for NumPy
import pandas as pd


s = pd.Series(["welcome", "to", "data 100"])

s

0     welcome
1          to
2    data 100
dtype: object


s.values

array(['welcome', 'to', 'data 100'], dtype=object)


s.index

RangeIndex(start=0, stop=3, step=1)


s = pd.Series([-1, 10, 2], index = ["a", "b", "c"])
s

a    -1
b    10
c     2
dtype: int64


s.values

array([-1, 10,  2])


s.index

Index(['a', 'b', 'c'], dtype='object')


s.index = ["first", "second", "third"]
s

first     -1
second    10
third      2
dtype: int64


s = pd.Series([4, -2, 0, 6], index = ["a", "b", "c", "d"])
s

a    4
b   -2
c    0
d    6
dtype: int64


# Selection using a single label
# Notice how the return value is a single array element
s["a"]

4


# Selection using a list of labels
# Notice how the return value is another Series
s[["a", "c"]]

a    4
c    0
dtype: int64


# Filter condition: select all elements greater than 0
s>0

a     True
b    False
c    False
d     True
dtype: bool


# Use the Boolean filter to select data from the original Series
s[s>0]

a    4
d    6
dtype: int64


elections = pd.read_csv("data/elections.csv")
elections


elections = pd.read_csv("data/elections.csv", index_col = "Candidate")
elections


# Creating a single-column DataFrame using a list
df_list_1 = pd.DataFrame([1, 2, 3], 
                         columns = ["Number"])
display(df_list_1)


# Creating a multi-column DataFrame using a list of lists
df_list_2 = pd.DataFrame([[1, "one"], [2, "two"]], 
                         columns = ["Number", "Description"])
df_list_2


# Creating a DataFrame from a dictionary of columns
df_dict_1 = pd.DataFrame({"Fruit":["Strawberry", "Orange"], 
                          "Price":[5.49, 3.99]})
df_dict_1


# Creating a DataFrame from a list of row dictionaries
df_dict_2 = pd.DataFrame([{"Fruit":"Strawberry", "Price":5.49}, 
                          {"Fruit":"Orange", "Price":3.99}])
df_dict_2


# In the examples below, we create a DataFrame from a Series

s_a = pd.Series(["a1", "a2", "a3"], index = ["r1", "r2", "r3"])
s_b = pd.Series(["b1", "b2", "b3"], index = ["r1", "r2", "r3"])


# Passing Series objects for columns
df_ser = pd.DataFrame({"A-column":s_a, "B-column":s_b})
df_ser


# Passing a Series to the DataFrame constructor to make a one-column dataframe
df_ser = pd.DataFrame(s_a)
df_ser


# Using to_frame() to convert a Series to DataFrame
ser_to_df = s_a.to_frame()
ser_to_df


# Creating a DataFrame from a CSV file and specifying the Index column
elections = pd.read_csv("data/elections.csv", index_col = "Candidate")
elections.head(5) # Using `.head` shows only the first 5 rows to save space


elections.reset_index(inplace = True) # Need to reset the index to keep 'Candidate' as one of the DataFrane Columns
elections.set_index("Party", inplace=True) # This sets the index to the "Candidate" column
elections


elections.index

Index(['Democratic-Republican', 'Democratic-Republican', 'Democratic',
       'National Republican', 'Democratic', 'National Republican',
       'Anti-Masonic', 'Whig', 'Democratic', 'Whig',
       ...
       'Constitution', 'Republican', 'Independent', 'Libertarian',
       'Democratic', 'Green', 'Democratic', 'Republican', 'Libertarian',
       'Green'],
      dtype='object', name='Party', length=182)


elections.columns

Index(['Candidate', 'Year', 'Popular vote', 'Result', '%'], dtype='object')


elections.reset_index(inplace=True) # Revert the index back to its default numeric labeling
elections


elections.shape

(182, 6)


# Loading DataFrame again to keep the original ordering of columns
elections = pd.read_csv("data/elections.csv")

elections.head() # By default, calling .head with no argument will show the first 5 rows


elections.head(3)


elections.tail(5)


# Selection by a list
elections.loc[[87, 25, 179], ["Year", "Candidate", "Result"]]


# Selection by a list and a slice of columns
elections.loc[[87, 25, 179], "Popular vote":"%"]


# Extracting all rows using a colon
elections.loc[:, ["Year", "Candidate", "Result"]]


# Extracting all columns using a colon
elections.loc[[87, 25, 179], :]


# Selection by a list and a single-column label
elections.loc[[87, 25, 179], "Popular vote"]

87     15761254
25       848019
179    74216154
Name: Popular vote, dtype: int64


# Note that if we pass "Popular vote" in a list, the output will be a DataFrame
elections.loc[[87, 25, 179], ["Popular vote"]]


# Selection by a row label and a column label
elections.loc[0, "Candidate"]

'Andrew Jackson'


# Select the rows at positions 1, 2, and 3.
# Select the columns at positions 0, 1, and 2.
# Remember that Python indexing begins at position 0!
elections.iloc[[1, 2, 3], [0, 1, 2]]


# Index-based extraction using a list of rows and a slice of column indices
elections.iloc[[1, 2, 3], 0:3]


# Selecting all rows using a colon
elections.iloc[:, 0:3]


elections.iloc[[1, 2, 3], 1]

1    John Quincy Adams
2       Andrew Jackson
3    John Quincy Adams
Name: Candidate, dtype: object


# Extracting the value at row 0 and the second column
elections.iloc[0,1]

'Andrew Jackson'


elections[3:7]


elections[["Year", "Candidate", "Result"]]


elections["Candidate"]

0         Andrew Jackson
1      John Quincy Adams
2         Andrew Jackson
3      John Quincy Adams
4         Andrew Jackson
             ...        
177           Jill Stein
178         Joseph Biden
179         Donald Trump
180         Jo Jorgensen
181       Howard Hawkins
Name: Candidate, Length: 182, dtype: object


example = pd.Series([4, 5, 6], index=["one", "two", "three"])
example[example > 4].values

array([5, 6])


df = pd.DataFrame({"c1":[1, 2, 3, 4], "c2":[2, 4, 6, 8]})
df.columns

Index(['c1', 'c2'], dtype='object')

	Year	Candidate	Party	Popular vote	Result	%
0	1824	Andrew Jackson	Democratic-Republican	151271	loss	57.210122
1	1824	John Quincy Adams	Democratic-Republican	113142	win	42.789878
2	1828	Andrew Jackson	Democratic	642806	win	56.203927
3	1828	John Quincy Adams	National Republican	500897	loss	43.796073
4	1832	Andrew Jackson	Democratic	702735	win	54.574789
...	...	...	...	...	...	...
177	2016	Jill Stein	Green	1457226	loss	1.073699
178	2020	Joseph Biden	Democratic	81268924	win	51.311515
179	2020	Donald Trump	Republican	74216154	loss	46.858542
180	2020	Jo Jorgensen	Libertarian	1865724	loss	1.177979
181	2020	Howard Hawkins	Green	405035	loss	0.255731

	Candidate	Party	Popular vote	Result	%
Year
1824	Andrew Jackson	Democratic-Republican	151271	loss	57.210122
1824	John Quincy Adams	Democratic-Republican	113142	win	42.789878
1828	Andrew Jackson	Democratic	642806	win	56.203927
1828	John Quincy Adams	National Republican	500897	loss	43.796073
1832	Andrew Jackson	Democratic	702735	win	54.574789
...	...	...	...	...	...
2016	Jill Stein	Green	1457226	loss	1.073699
2020	Joseph Biden	Democratic	81268924	win	51.311515
2020	Donald Trump	Republican	74216154	loss	46.858542
2020	Jo Jorgensen	Libertarian	1865724	loss	1.177979
2020	Howard Hawkins	Green	405035	loss	0.255731

	Year	Party	Popular vote	Result	%
Candidate
Andrew Jackson	1824	Democratic-Republican	151271	loss	57.210122
John Quincy Adams	1824	Democratic-Republican	113142	win	42.789878
Andrew Jackson	1828	Democratic	642806	win	56.203927
John Quincy Adams	1828	National Republican	500897	loss	43.796073
Andrew Jackson	1832	Democratic	702735	win	54.574789

	Candidate	Year	Popular vote	Result	%
Party
Democratic-Republican	Andrew Jackson	1824	151271	loss	57.210122
Democratic-Republican	John Quincy Adams	1824	113142	win	42.789878
Democratic	Andrew Jackson	1828	642806	win	56.203927
National Republican	John Quincy Adams	1828	500897	loss	43.796073
Democratic	Andrew Jackson	1832	702735	win	54.574789
...	...	...	...	...	...
Green	Jill Stein	2016	1457226	loss	1.073699
Democratic	Joseph Biden	2020	81268924	win	51.311515
Republican	Donald Trump	2020	74216154	loss	46.858542
Libertarian	Jo Jorgensen	2020	1865724	loss	1.177979
Green	Howard Hawkins	2020	405035	loss	0.255731

	Party	Candidate	Year	Popular vote	Result	%
0	Democratic-Republican	Andrew Jackson	1824	151271	loss	57.210122
1	Democratic-Republican	John Quincy Adams	1824	113142	win	42.789878
2	Democratic	Andrew Jackson	1828	642806	win	56.203927
3	National Republican	John Quincy Adams	1828	500897	loss	43.796073
4	Democratic	Andrew Jackson	1832	702735	win	54.574789
...	...	...	...	...	...	...
177	Green	Jill Stein	2016	1457226	loss	1.073699
178	Democratic	Joseph Biden	2020	81268924	win	51.311515
179	Republican	Donald Trump	2020	74216154	loss	46.858542
180	Libertarian	Jo Jorgensen	2020	1865724	loss	1.177979
181	Green	Howard Hawkins	2020	405035	loss	0.255731

Lecture 2 – Data 100, Spring 2024¶

Series, DataFrames, and Indices¶

Series¶

Creating a new `Series` object¶

Selection in Series¶

DataFrame¶

Creating a new `DataFrame` object¶

Creating a `DataFrame` from a CSV file¶

Creating a `DataFrame` using a list and column names¶

Creating a `DataFrame` from a dictionary¶

Creating a `DataFrame` from a `Series`¶

`DataFrame` attributes: `index`, `columns`, and `shape`¶

Slicing in `DataFrame`s¶

Label-Based Extraction Using`loc`¶

Integer-Based Extraction Using `iloc`¶

Context-dependent Extraction using `[]`¶

Slido Exercises¶

	Year	Candidate	Result
87	1932	Herbert Hoover	loss
25	1860	John C. Breckinridge	loss
179	2020	Donald Trump	loss

	Popular vote	Result	%
87	15761254	loss	39.830594
25	848019	loss	18.138998
179	74216154	loss	46.858542

	Number
0	1
1	2
2	3

	Number	Description
0	1	one
1	2	two

	Fruit	Price
0	Strawberry	5.49
1	Orange	3.99

	Fruit	Price
0	Strawberry	5.49
1	Orange	3.99

Lecture 2 – Data 100, Spring 2024¶

Series, DataFrames, and Indices¶

Series¶

Creating a new Series object¶

Selection in Series¶

DataFrame¶

Creating a new DataFrame object¶

Creating a DataFrame from a CSV file¶

Creating a DataFrame using a list and column names¶

Creating a DataFrame from a dictionary¶

Creating a DataFrame from a Series¶

DataFrame attributes: index, columns, and shape¶

Slicing in DataFrames¶

Label-Based Extraction Usingloc¶

Integer-Based Extraction Using iloc¶

Context-dependent Extraction using []¶

Slido Exercises¶

Creating a new `Series` object¶

Creating a new `DataFrame` object¶

Creating a `DataFrame` from a CSV file¶

Creating a `DataFrame` using a list and column names¶

Creating a `DataFrame` from a dictionary¶

Creating a `DataFrame` from a `Series`¶

`DataFrame` attributes: `index`, `columns`, and `shape`¶

Slicing in `DataFrame`s¶

Label-Based Extraction Using`loc`¶

Integer-Based Extraction Using `iloc`¶

Context-dependent Extraction using `[]`¶