# Linear algebra, probability
import numpy as np

# Data manipulation
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Interactive visualization library
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.express as px


# Load the two class CSV files into DataFrames. `pd` is the pandas alias
# imported above (pandas is covered starting next lecture); some of its
# syntax is shared with data8's datascience package.
majors = pd.read_csv("data/majors-sp24.csv")
names = pd.read_csv("data/names-sp24.csv")


# Preview the first 20 rows of the majors DataFrame.

majors.head(n=20)


# Preview the names DataFrame; 5 rows is what head() shows by default.

names.head(5)


# Lower-case every name so that differently-capitalized spellings of the
# same name compare equal.
names['Name'] = names['Name'].str.lower()


names.head()


# Tally names by their first character. value_counts() returns the tallies
# ordered from most to least frequent.

first_letter = names['Name'].str.get(0).value_counts()
first_letter.head()

Name
a    185
j    136
s    133
m     99
r     65
Name: count, dtype: int64


# Bar chart of the first-letter distribution: one bar per letter, with
# height equal to the number of names starting with it.

plt.bar(x=first_letter.index, height=first_letter.values)
plt.title('First Letter Frequency Distribution')
plt.xlabel('First Letter')
plt.ylabel('Frequency')
plt.show()


# Number of characters in each name.
name_lengths = names['Name'].str.len()

# Compute the extremes once with the vectorized Series reductions instead of
# re-scanning the Series with the Python built-ins for every range() below.
shortest, longest = name_lengths.min(), name_lengths.max()

# Series.mean() leaves missing values out of both the sum and the count,
# whereas sum() / len() would still count missing rows in the divisor.
average_length = name_lengths.mean()

# One unit-wide bin per length: bin edges run from `shortest` to `longest + 1`.
plt.hist(name_lengths, bins=range(shortest, longest + 2), edgecolor='black')
plt.xlabel('Name Length')
plt.ylabel('Frequency')
plt.title('Distribution of Length of Names')
# Dashed vertical line marking the average length (shown in the legend).
plt.axvline(average_length, color='red', linestyle='dashed', linewidth=1, label=f'Average: {average_length:.2f}')
plt.legend()
# Put a tick on every integer length.
plt.xticks(range(shortest, longest + 1))
plt.show()


# Number of rows in each table, printed one per line.
print(len(names), len(majors), sep="\n")

1276
1276


names.head()


# Boolean mask selecting the rows whose Name is the "#ref!" placeholder
# (names were lower-cased earlier, hence the lower-case spelling).
names.loc[names['Name'] == "#ref!"]


# Keep every row except the "#ref!" placeholder.
names = names.loc[names['Name'] != "#ref!"]


names['Name'].value_counts().to_frame()  # How often each distinct name occurs


majors.columns   # Column names of the majors table

Index(['Majors', 'Terms in Attendance'], dtype='object')


# Frequency of each distinct "Terms in Attendance" value.
majors['Terms in Attendance'].value_counts().to_frame()


# Rows whose "Terms in Attendance" is the "#REF!" placeholder.
majors[majors['Terms in Attendance'] == "#REF!"]


# Drop the "#REF!" rows. Take an explicit copy so that `majors` owns its data:
# later cells assign into it with .loc, which on a filtered slice of another
# DataFrame triggers pandas' SettingWithCopyWarning.
majors = majors[majors['Terms in Attendance'] != "#REF!"].copy()
majors['Terms in Attendance'].value_counts().to_frame()


# Summary statistics for each table.
names.describe()


majors.describe()


# The 20 most frequent majors as a one-column DataFrame, built with pandas
# method chaining.
majors_count = (
    majors['Majors']
    .value_counts()
    .sort_values(ascending=False)  # largest count first
    .head(20)                      # keep the top 20 ...
    .to_frame()                    # ... and turn the Series into a DataFrame
)

majors_count


# Interactive horizontal bar chart (plotly) of the top-20 major counts.
# The rows are passed in reverse order — presumably so that the largest
# count is drawn at the top of the chart.

fig = px.bar(majors_count.iloc[::-1], orientation='h')
fig.update_layout(
    width=800,
    height=500,
    autosize=False,
    showlegend=False,
    xaxis_title='Count',
    yaxis_title='Major',
)


# Histogram of "Terms in Attendance", normalized so the bars show the
# fraction of the class rather than raw counts.
fig = px.histogram(
    majors['Terms in Attendance'].sort_values(),
    histnorm='probability',
)

fig.update_layout(
    width=800,
    height=250,
    autosize=False,
    showlegend=False,
    xaxis_title="Term",
    yaxis_title="Fraction of Class",
)


# Replace the terms-in-attendance values with the degree objective: 'G'
# becomes 'Graduate' and every other value becomes 'Undergraduate'.
# NOTE(review): rows with a missing term also compare unequal to 'G' and are
# therefore labelled 'Undergraduate' — confirm that this is intended.
is_grad = majors['Terms in Attendance'] == 'G'
majors.loc[~is_grad, 'Terms in Attendance'] = 'Undergraduate'
majors.loc[is_grad, 'Terms in Attendance'] = 'Graduate'


# Rename the column to match its new contents.
majors.rename({'Terms in Attendance': 'Ungrad Grad'}, axis='columns', inplace=True)

majors.describe()


# Column names of both tables after cleaning.
print(majors.columns)
print(names.columns)

Index(['Majors', 'Ungrad Grad'], dtype='object')
Index(['Name'], dtype='object')


# CSV export of the campus headcount spreadsheet.
url = "https://docs.google.com/spreadsheets/d/1J7tz3GQLs3M6hFseJCE9KhjVhe4vKga8Q2ezu0oG5sQ/gviz/tq?tqx=out:csv"

# Read only the columns used in the analysis below.
university_majors = pd.read_csv(
    url,
    usecols=[
        'Academic Yr',
        'Semester',
        'Ungrad Grad',
        'Entry Status',
        'Major Short Nm',
        'Student Headcount',
    ],
)


# Examining the data
university_majors


# Report headcounts per academic year: 'Semester' is left out of the grouping
# keys, so each group's headcount is averaged over that year's semesters.
university_majors = (
    university_majors
    .groupby(['Academic Yr', 'Ungrad Grad', 'Entry Status', 'Major Short Nm'], as_index=False)[["Student Headcount"]]
    .mean()
)

university_majors


# Total headcount per academic year and level (graduate / undergraduate).
university_grad_vs_ungrd = (
    university_majors
    .groupby(['Academic Yr', 'Ungrad Grad'], as_index=False)[["Student Headcount"]]
    .sum()
)

# Reshape to one row per academic year with one column per level, then add
# the yearly total and each level's share of that total.
proportions = university_grad_vs_ungrd.pivot(
    index='Academic Yr',
    columns='Ungrad Grad',
    values='Student Headcount',
)
proportions['Total'] = proportions['Undergraduate'] + proportions['Graduate']
proportions['Undergrad Proportion'] = proportions['Undergraduate'] / proportions['Total']
proportions['Grad Proportion'] = proportions['Graduate'] / proportions['Total']


# Bar chart of graduate and undergraduate headcounts by year.
fig = px.bar(
    proportions.reset_index(),
    x='Academic Yr',
    y=['Undergraduate', 'Graduate'],
    color_discrete_map={'Undergraduate': 'blue', 'Graduate': 'orange'},
    labels={'value': 'Number of Students'},
    title='Number of Grad vs. Undergrad Students',
)

fig.update_layout(width=800, height=600, autosize=False, barmode='relative')
fig.show()


# Count the Data 100 students at each level by summing a boolean mask
# (each True counts as 1).
data100_grad = (majors['Ungrad Grad'] == 'Graduate').sum()

data100_undergrad = (majors['Ungrad Grad'] == 'Undergraduate').sum()

print("Number of graduate students in Data 100: ", data100_grad)
print("Number of undergraduate students in Data 100: ", data100_undergrad)

Number of graduate students in Data 100:  52
Number of undergraduate students in Data 100:  1223


# Build a single row for Data 100 with the same columns as `proportions`.
data100_total = data100_grad + data100_undergrad
data100_row = {
    'Graduate': [data100_grad],
    'Undergraduate': [data100_undergrad],
    'Total': [data100_total],
    'Undergrad Proportion': [data100_undergrad / data100_total],
    'Grad Proportion': [data100_grad / data100_total],
}

new_row_df = pd.DataFrame(data100_row)

# Append it to the campus table under the index label 'Data 100'.
proportions.loc['Data 100'] = new_row_df.loc[0]


# Bar chart comparing the grad/undergrad shares of each academic year
# with those of Data 100.
fig = px.bar(
    proportions.reset_index(),
    x='Academic Yr',
    y=['Undergrad Proportion', 'Grad Proportion'],
    color_discrete_map={'Undergrad Proportion': 'blue', 'Grad Proportion': 'orange'},
    labels={'value': 'Proportion'},
    title='Proportions of Grad vs. Undergrad Students',
)


fig.update_layout(width=800, height=600, autosize=False, barmode='relative')
fig.show()


# The 20 most frequent majors among Data 100 students (method chaining).
data100_top_20_majors = (
    majors['Majors']
    .value_counts()
    .sort_values(ascending=False)  # largest count first
    .head(20)                      # keep the top 20 ...
    .to_frame()                    # ... as a one-column DataFrame
)

# Campus headcount per academic year and major, summed over the remaining
# grouping columns.
major_trends = (
    university_majors
    .groupby(['Academic Yr', 'Major Short Nm'], as_index=False)[["Student Headcount"]]
    .sum()
)


print("Top 20 majors at Berkeley in 2022-23")
(
    major_trends
    .loc[major_trends['Academic Yr'] == '2022-23']
    .sort_values(by='Student Headcount', ascending=False)
    .head(20)
)

Top 20 majors at Berkeley in 2022-23


print("Top 20 majors at Berkeley since 2013")
# Sum each major's yearly headcount across all years and rank the totals.
(
    major_trends
    .groupby(['Major Short Nm'], as_index=False)[['Student Headcount']]
    .sum()
    .sort_values(by='Student Headcount', ascending=False)
    .head(20)
)

Top 20 majors at Berkeley since 2013


# Heading followed by the top-20 table, each on its own line.
print("Top 20 majors at Berkeley in Data 100", data100_top_20_majors, sep="\n")

Top 20 majors at Berkeley in Data 100
                                                    count
Majors                                                   
Letters & Sci Undeclared UG                           341
Computer Science BA                                   148
Data Science BA (Subplan: Business/Industrial A...     50
Electrical Eng & Comp Sci BS                           43
Economics BA                                           37
Cognitive Science BA                                   33
Data Science BA (Subplan: Economics)                   32
Civil Engineering BS                                   30
Data Science BA (Subplan: Applied Mathematics &...     23
Mol Sci & Software Engin MMSSE (Subplan: Part-T...     22
Letters & Sci Undeclared UG (Subplan: Applied H...     19
Economics BA, Minor: Data Science UG                   17
Applied Mathematics BA (Subplan: Data Science)         15
Industrial Eng & Ops Rsch BS                           15
Data Science BA (Subplan: Cognition)                   14
Data Science BA (Subplan: Robotics)                    12
Electrical Eng & Comp Sci BS, Minor: Data Scien...     11
Bioengineering BS                                      10
Applied Mathematics BA                                  9
Business Administration BS                              8


# Drop the last space-separated word of every major label (e.g. the trailing
# "BA" / "BS" / "UG") so that the labels can be compared with the campus
# data's 'Major Short Nm' values.
# NOTE(review): labels carrying a "(Subplan: ...)" or ", Minor: ..." suffix
# only lose the final word of that suffix, so they will not line up with a
# campus major name — confirm whether those rows should be normalized too.
major_labels = data100_top_20_majors.index
data100_top_20_majors.index = major_labels.str.rsplit(' ', n=1).str.get(0)
print("Top 20 majors at Berkeley in Data 100", data100_top_20_majors, sep="\n")

Top 20 majors at Berkeley in Data 100
                                                    count
Majors                                                   
Letters & Sci Undeclared                              341
Computer Science                                      148
Data Science BA (Subplan: Business/Industrial          50
Electrical Eng & Comp Sci                              43
Economics                                              37
Cognitive Science                                      33
Data Science BA (Subplan:                              32
Civil Engineering                                      30
Data Science BA (Subplan: Applied Mathematics &        23
Mol Sci & Software Engin MMSSE (Subplan:               22
Letters & Sci Undeclared UG (Subplan: Applied H...     19
Economics BA, Minor: Data Science                      17
Applied Mathematics BA (Subplan: Data                  15
Industrial Eng & Ops Rsch                              15
Data Science BA (Subplan:                              14
Data Science BA (Subplan:                              12
Electrical Eng & Comp Sci BS, Minor: Data Science      11
Bioengineering                                         10
Applied Mathematics                                     9
Business Administration                                 8


# Yearly campus headcount for every campus major whose name appears among
# the Data 100 top-20 labels.
fig = px.line(
    major_trends[major_trends["Major Short Nm"].isin(data100_top_20_majors.index)],
    x="Academic Yr",
    y="Student Headcount",
    color="Major Short Nm",
)

fig.update_layout(width=800, height=600, autosize=False)
fig.show()


# Same chart without the first (most common) row of the top-20 table —
# presumably so that the largest major does not dominate the y-axis.
data100_top_19_majors = data100_top_20_majors.iloc[1:]

fig = px.line(
    major_trends[major_trends["Major Short Nm"].isin(data100_top_19_majors.index)],
    x="Academic Yr",
    y="Student Headcount",
    color="Major Short Nm",
)

fig.update_layout(width=800, height=600, autosize=False)
fig.show()

	Majors	Terms in Attendance
0	Electrical Eng & Comp Sci BS	8.0
1	Data Science BA (Subplan: Robotics)	6.0
2	Electrical Eng & Comp Sci MEng	G
3	Letters & Sci Undeclared UG	4.0
4	Molecular & Cell Biology BA (Subplan: Biochem,...	6.0
5	Civil Engineering BS	8.0
6	Computer Science BA	4.0
7	Political Economy BA	6.0
8	Letters & Sci Undeclared UG	8.0
9	Art BA	4.0
10	Chemical Biology BS	6.0
11	Letters & Sci Undeclared UG	4.0
12	Chemical Engineering PhD	G
13	Computer Science BA	4.0
14	Computer Science BA	4.0
15	Environmental Sciences BS	2.0
16	Letters & Sci Undeclared UG	4.0
17	Letters & Sci Undeclared UG	2.0
18	Letters & Sci Undeclared UG	6.0
19	Cognitive Science BA	4.0

	Name
0	Emily
1	Andrew
2	Andrew
3	NATHAN
4	Yimo

	Name
0	emily
1	andrew
2	andrew
3	nathan
4	yimo

	Name
0	emily
1	andrew
2	andrew
3	nathan
4	yimo

	count
Name
matthew	13
ethan	9
andrew	9
emily	8
nathan	8
...	...
wolfgang	1
sukriti	1
in	1
teresa	1
olivia	1

Lecture 1 – Data 100, Spring 2024¶

Software Packages¶

1. Starting with a Question: Who are you (the students of Data 100)?¶

2. Data Acquisition and Cleaning¶

3. Exploratory Data Analysis¶

Peeking at the Data¶

What is one potential issue we may need to address in this data?¶

Exploratory Data Analysis on `names` dataset¶

What is the most common first letter in names? What is its distribution?¶

What is the distribution of the length of names?¶

How many records do we have?¶

Understanding the structure of data¶

Exploratory Data Analysis on `majors` dataset¶

Summarizing the Data¶

4. Analysis: Understanding the World¶

What are your majors?¶

We will often use visualizations to make sense of data¶

What year are you?¶

1. New Questions¶

How could we answer this question?¶

We don't have the data.¶

UC Berkeley Student Headcount by Major and Demographics¶

2. Acquire data programmatically¶

3. Exploratory Data Analysis on Campus Data¶

What is the historical distribution of graduate and undergraduate students at Berkeley?¶

4.1. Ratio between graduate and undergraduate students in Data 100, and its comparison with campus distribution¶

4.2. Proportion of different majors in Data 100, and their historical enrollment trends¶

	count
Terms in Attendance
4.0	539
6.0	407
8.0	168
G	52
2.0	43
7.0	30
5.0	13
#REF!	1
3.0	1

	Majors	Terms in Attendance
count	1275	1253
unique	259	8
top	Letters & Sci Undeclared UG	4.0
freq	341	539

	Academic Yr	Semester	Ungrad Grad	Entry Status	Major Short Nm	Student Headcount
0	2013-14	Fall	Graduate	Graduate	Education	327
1	2013-14	Fall	Graduate	Graduate	Special Education	14
2	2013-14	Fall	Graduate	Graduate	Science & Math Education	16
3	2013-14	Fall	Graduate	Graduate	Chemical Engineering	132
4	2013-14	Fall	Graduate	Graduate	Chemistry	404
...	...	...	...	...	...	...
7278	2022-23	Spring	Undergraduate	Transfer Entrant	Nut Sci-Physio & Metabol	20
7279	2022-23	Spring	Undergraduate	Transfer Entrant	Nutritional Sci-Dietetics	3
7280	2022-23	Spring	Undergraduate	Transfer Entrant	Nutritional Sci-Toxicology	3
7281	2022-23	Spring	Undergraduate	Transfer Entrant	Genetics & Plant Biology	10
7282	2022-23	Spring	Undergraduate	Transfer Entrant	Microbial Biology	49

	Academic Yr	Major Short Nm	Student Headcount
1993	2022-23	Letters & Sci Undeclared	10651.0
1983	2022-23	L&S Computer Science	2102.5
1932	2022-23	Electrical Eng & Comp Sci	2093.0
1894	2022-23	Business Administration	1645.5
1928	2022-23	Economics	1579.5
1984	2022-23	L&S Data Science	1325.5
2020	2022-23	Molecular & Cell Biology	1225.5
2011	2022-23	Mechanical Engineering	1208.0
1992	2022-23	Law (JD)	1023.0
1973	2022-23	Info & Data Science-MIDS	1021.5
2042	2022-23	Political Science	1005.0
1948	2022-23	Evening & Weekend MBA	919.0
2043	2022-23	Psychology	760.0
1901	2022-23	Chemistry	691.0
2057	2022-23	Sociology	663.0
1881	2022-23	Architecture	604.5
1900	2022-23	Chemical Engineering	595.0
1889	2022-23	Bioengineering	576.0
1912	2022-23	Cognitive Science	505.5
1879	2022-23	Applied Mathematics	497.0

	Major Short Nm	Student Headcount
150	Letters & Sci Undeclared	101418.0
77	Electrical Eng & Comp Sci	18431.0
137	L&S Computer Science	14818.0
33	Business Administration	14302.5
72	Economics	14000.0
216	Political Science	10334.0
176	Mechanical Engineering	10193.5
149	Law (JD)	9645.5
95	Evening & Weekend MBA	7932.5
233	Sociology	6719.5
217	Psychology	6533.0
40	Chemical Engineering	6126.5
41	Chemistry	5941.5
13	Architecture	5680.5
85	English	5520.0
187	Molecular & Cell Biology	5431.0
125	Info & Data Science-MIDS	5105.0
26	Bioengineering	5061.0
128	Integrative Biology	5060.5
211	Physics	5055.0

Lecture 1 – Data 100, Spring 2024¶

Software Packages¶

1. Starting with a Question: Who are you (the students of Data 100)?¶

2. Data Acquisition and Cleaning¶

3. Exploratory Data Analysis¶

Peeking at the Data¶

What is one potential issue we may need to address in this data?¶

Exploratory Data Analysis on names dataset¶

What is the most common first letter in names? What is its distribution?¶

What is the distribution of the length of names?¶

How many records do we have?¶

Understanding the structure of data¶

Exploratory Data Analysis on majors dataset¶

Summarizing the Data¶

4. Analysis: Understanding the World¶

What are your majors?¶

We will often use visualizations to make sense of data¶

What year are you?¶

1. New Questions¶

How could we answer this question?¶

We don't have the data.¶

UC Berkeley Student Headcount by Major and Demographics¶

2. Acquire data programmatically¶

3. Exploratory Data Analysis on Campus Data¶

What is the historical distribution of graduate and undergraduate students at Berkeley?¶

4.1. Ratio between graduate and undergraduate students in Data 100, and its comparison with campus distribution¶

4.2. Proportion of different majors in Data 100, and their historical enrollment trends¶

Exploratory Data Analysis on `names` dataset¶

Exploratory Data Analysis on `majors` dataset¶