# Linear algebra, probability
import numpy as np

# Data manipulation
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Interactive visualization library
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.express as px

# pd stands for pandas, which we will learn starting from next lecture
# Some pandas syntax shared with data8's datascience package
majors = pd.read_csv("data/majors.csv")
names = pd.read_csv("data/names.csv")

# Let's peek at the first 20 rows of the majors dataframe

majors.head(20)

# Let's peek at the first 5 rows (default) of the names dataframe

names.head()

names['Name'] = names['Name'].str.lower()

names.head()

# Below are the most common, in descending frequency

first_letter = names['Name'].str[0].value_counts()
first_letter.head()

Name
a    192
j    137
s    130
m     97
e     72
Name: count, dtype: int64

# Let's visualize this first letter distribution 

plt.bar(first_letter.index, first_letter.values)
plt.xlabel('First Letter')
plt.ylabel('Frequency')
plt.title('First Letter Frequency Distribution')
plt.show()

name_lengths = names['Name'].str.len()

plt.hist(name_lengths, bins=range(min(name_lengths), max(name_lengths) + 2), edgecolor='black')
plt.xlabel('Name Length')
plt.ylabel('Frequency')
plt.title('Distribution of Length of Names')
average_length = name_lengths.sum() / len(name_lengths)
plt.axvline(average_length, color='red', linestyle='dashed', linewidth=1, label=f'Average: {average_length:.2f}')
plt.legend()
plt.xticks(range(min(name_lengths), max(name_lengths) + 1))
plt.show()

print(len(names))
print(len(majors))

1291
1291

names.head()

names['Role'].value_counts().to_frame()  # counts of unique Roles

# Boolean index to find rows where Role is #REF!
names[names['Role'] == "#REF!"]

names = names[names['Role'] != "#REF!"]

names['Role'].value_counts().to_frame()  # Again, counts of unique Roles

majors.columns   # Get column names

Index(['Majors', 'Terms in Attendance'], dtype='object')

majors['Terms in Attendance'].value_counts().to_frame()

majors[majors['Terms in Attendance'] == "#REF!"]

majors = majors[majors['Terms in Attendance'] != "#REF!"]
majors['Terms in Attendance'].value_counts().to_frame()

names.describe()

majors.describe()

majors_count = (       # Method chaining in pandas
    majors['Majors']
    .value_counts()
    .sort_values(ascending=False) # Highest first
    .to_frame()
    .head(20)          # Get the top 20
)

majors_count

# Interactive using plotly

fig = px.bar(majors_count.loc[::-1], orientation='h')
fig.update_layout(showlegend=False, 
                  xaxis_title='Count', 
                  yaxis_title='Major',
                  autosize=False, 
                  width=800, 
                  height=500)

fig = px.histogram(majors['Terms in Attendance'].sort_values(),
                   histnorm='probability')

fig.update_layout(showlegend=False,
                  xaxis_title="Term",
                  yaxis_title="Fraction of Class",
                  autosize=False, 
                  width=800, 
                  height=250)

# Replacing terms in attendance data with the degree objective
majors.loc[majors.loc[:, 'Terms in Attendance'] != 'G', 'Terms in Attendance'] = 'Undergraduate'
majors.loc[majors.loc[:, 'Terms in Attendance'] == 'G', 'Terms in Attendance'] = 'Graduate'


majors.rename(columns={'Terms in Attendance': 'Ungrad Grad'}, inplace=True)

majors.describe()

print(majors.columns)
print(names.columns)

Index(['Majors', 'Ungrad Grad'], dtype='object')
Index(['Name', 'Role'], dtype='object')

url = "https://docs.google.com/spreadsheets/d/1J7tz3GQLs3M6hFseJCE9KhjVhe4vKga8Q2ezu0oG5sQ/gviz/tq?tqx=out:csv"

university_majors = pd.read_csv(url, 
                                usecols = ['Academic Yr', 'Semester', 'Ungrad Grad', 
                                           'Entry Status', 'Major Short Nm', 'Student Headcount'])

# Examining the data
university_majors

# Reporting student data based on academic year
university_majors = (university_majors.groupby(
    ['Academic Yr', 'Ungrad Grad', 'Entry Status', 'Major Short Nm'], as_index = False)[["Student Headcount"]]
                     .mean()
                    )

university_majors

university_grad_vs_ungrd = (university_majors.groupby(
    ['Academic Yr', 'Ungrad Grad'], as_index = False)[["Student Headcount"]]
                            .sum()
                           )

proportions = university_grad_vs_ungrd.pivot(index='Academic Yr', columns='Ungrad Grad', values='Student Headcount')
proportions['Total'] = proportions['Undergraduate'] + proportions['Graduate']
proportions['Undergrad Proportion'] = proportions['Undergraduate'] / proportions['Total']
proportions['Grad Proportion'] = proportions['Graduate'] / proportions['Total']


fig = px.bar(proportions.reset_index(), 
             x='Academic Yr', 
             y=['Undergraduate', 'Graduate'],
             title='Number of Grad vs. Undergrad Students',
             labels={'value': 'Number of Students'},
             color_discrete_map={'Undergraduate': 'blue', 'Graduate': 'orange'})

fig.update_layout(barmode='relative', autosize=False, width=800, height=600)
fig.show()

data100_grad = majors['Ungrad Grad'].loc[majors['Ungrad Grad'] == 'Graduate'].count()

data100_undergrad = majors['Ungrad Grad'].loc[majors['Ungrad Grad'] == 'Undergraduate'].count()

print("Number of graduate students in Data 100: ", data100_grad)
print("Number of undergraduate students in Data 100: ", data100_undergrad)

Number of graduate students in Data 100:  124
Number of undergraduate students in Data 100:  1166

data100_row = {'Graduate':[data100_grad], 
               'Undergraduate':[data100_undergrad], 
               'Total':[data100_grad + data100_undergrad], 
               'Undergrad Proportion':[data100_undergrad / (data100_grad + data100_undergrad)],
               'Grad Proportion':[data100_grad / (data100_grad + data100_undergrad)], 
               }

new_row_df = pd.DataFrame(data100_row)

proportions.loc['Data 100'] = new_row_df.iloc[0]


fig = px.bar(proportions.reset_index(), 
             x='Academic Yr', 
             y=['Undergrad Proportion', 'Grad Proportion'],
             title='Proportions of Grad vs. Undergrad Students',
             labels={'value': 'Proportion'},
             color_discrete_map={'Undergrad Proportion': 'blue', 'Grad Proportion': 'orange'})


fig.update_layout(barmode='relative', autosize=False, width=800, height=600)
fig.show()

data100_top_20_majors = (       # Method chaining in pandas
    majors['Majors']
    .value_counts()
    .sort_values(ascending=False) # Highest first
    .to_frame()
    .head(20)          # Get the top 20
)

major_trends = university_majors.groupby(['Academic Yr', 'Major Short Nm'], 
                                         as_index = False)[["Student Headcount"]].sum()


print("Top 20 majors at Berkeley in 2022-23")
major_trends[major_trends.loc[:, 'Academic Yr'] == '2022-23'].sort_values('Student Headcount', ascending=False).head(20)

Top 20 majors at Berkeley in 2022-23

print("Top 20 majors at Berkeley since 2013")
major_trends.groupby(['Major Short Nm'], as_index = False)[['Student Headcount']].sum().sort_values('Student Headcount', ascending=False).head(20)

Top 20 majors at Berkeley since 2013

print("Top 20 majors at Berkeley in Data 100")
print(data100_top_20_majors)

Top 20 majors at Berkeley in Data 100
                                                    count
Majors                                                   
Letters & Sci Undeclared UG                           353
Data Science BA                                       129
Computer Science BA                                   124
Electrical Eng & Comp Sci BS                           69
Economics BA                                           65
Cognitive Science BA                                   29
Electrical Eng & Comp Sci MEng                         25
Applied Mathematics BA                                 24
Civil Engineering BS                                   22
Mol Sci & Software Engin MMSSE                         19
Business Administration BS, Letters & Sci Undec...     19
Data Science BA, Economics BA                          17
Chemical Engineering BS                                15
Development Engineering MDE                            13
Public Policy MPP                                      12
Computer Science BA, Data Science BA                   11
Info Mgmt & Systems MIMS                               10
Molecular & Cell Biology BA                            10
Bioengineering BS                                       9
Public Health BA                                        8

data100_top_20_majors.index = data100_top_20_majors.index.str.rsplit(' ', n=1).str[0]
print("Top 20 majors at Berkeley in Data 100")
print(data100_top_20_majors)

Top 20 majors at Berkeley in Data 100
                                                    count
Majors                                                   
Letters & Sci Undeclared                              353
Data Science                                          129
Computer Science                                      124
Electrical Eng & Comp Sci                              69
Economics                                              65
Cognitive Science                                      29
Electrical Eng & Comp Sci                              25
Applied Mathematics                                    24
Civil Engineering                                      22
Mol Sci & Software Engin                               19
Business Administration BS, Letters & Sci Undec...     19
Data Science BA, Economics                             17
Chemical Engineering                                   15
Development Engineering                                13
Public Policy                                          12
Computer Science BA, Data Science                      11
Info Mgmt & Systems                                    10
Molecular & Cell Biology                               10
Bioengineering                                          9
Public Health                                           8

fig = px.line(major_trends[major_trends["Major Short Nm"].isin(data100_top_20_majors.index)], 
                        x = "Academic Yr", y = "Student Headcount", color = "Major Short Nm")

fig.update_layout(autosize=False, width=800, height=600)
fig.show()

data100_top_19_majors = data100_top_20_majors.iloc[1:,:]

fig = px.line(major_trends[major_trends["Major Short Nm"].isin(data100_top_19_majors.index)], 
                        x = "Academic Yr", y = "Student Headcount", color = "Major Short Nm")

fig.update_layout(autosize=False, width=800, height=600)
fig.show()

	Academic Yr	Semester	Ungrad Grad	Entry Status	Major Short Nm	Student Headcount
0	2013-14	Fall	Graduate	Graduate	Education	327
1	2013-14	Fall	Graduate	Graduate	Special Education	14
2	2013-14	Fall	Graduate	Graduate	Science & Math Education	16
3	2013-14	Fall	Graduate	Graduate	Chemical Engineering	132
4	2013-14	Fall	Graduate	Graduate	Chemistry	404
...	...	...	...	...	...	...
7278	2022-23	Spring	Undergraduate	Transfer Entrant	Nut Sci-Physio & Metabol	20
7279	2022-23	Spring	Undergraduate	Transfer Entrant	Nutritional Sci-Dietetics	3
7280	2022-23	Spring	Undergraduate	Transfer Entrant	Nutritional Sci-Toxicology	3
7281	2022-23	Spring	Undergraduate	Transfer Entrant	Genetics & Plant Biology	10
7282	2022-23	Spring	Undergraduate	Transfer Entrant	Microbial Biology	49

	Academic Yr	Ungrad Grad	Entry Status	Major Short Nm	Student Headcount
0	2013-14	Graduate	Graduate	African American Studies	30.0
1	2013-14	Graduate	Graduate	Ag & Resource Economics	71.5
2	2013-14	Graduate	Graduate	Anc Hist & Medit Archae	15.0
3	2013-14	Graduate	Graduate	Anthropology	88.5
4	2013-14	Graduate	Graduate	Applied Mathematics	16.5
...	...	...	...	...	...
3735	2022-23	Undergraduate	Transfer Entrant	Spanish and Portuguese	12.0
3736	2022-23	Undergraduate	Transfer Entrant	Statistics	35.0
3737	2022-23	Undergraduate	Transfer Entrant	Sustainable Environ Dsgn	4.5
3738	2022-23	Undergraduate	Transfer Entrant	Theater & Perf Studies	44.0
3739	2022-23	Undergraduate	Transfer Entrant	Urban Studies	16.0

	Academic Yr	Major Short Nm	Student Headcount
1993	2022-23	Letters & Sci Undeclared	10651.0
1983	2022-23	L&S Computer Science	2102.5
1932	2022-23	Electrical Eng & Comp Sci	2093.0
1894	2022-23	Business Administration	1645.5
1928	2022-23	Economics	1579.5
1984	2022-23	L&S Data Science	1325.5
2020	2022-23	Molecular & Cell Biology	1225.5
2011	2022-23	Mechanical Engineering	1208.0
1992	2022-23	Law (JD)	1023.0
1973	2022-23	Info & Data Science-MIDS	1021.5
2042	2022-23	Political Science	1005.0
1948	2022-23	Evening & Weekend MBA	919.0
2043	2022-23	Psychology	760.0
1901	2022-23	Chemistry	691.0
2057	2022-23	Sociology	663.0
1881	2022-23	Architecture	604.5
1900	2022-23	Chemical Engineering	595.0
1889	2022-23	Bioengineering	576.0
1912	2022-23	Cognitive Science	505.5
1879	2022-23	Applied Mathematics	497.0

	Major Short Nm	Student Headcount
150	Letters & Sci Undeclared	101418.0
77	Electrical Eng & Comp Sci	18431.0
137	L&S Computer Science	14818.0
33	Business Administration	14302.5
72	Economics	14000.0
216	Political Science	10334.0
176	Mechanical Engineering	10193.5
149	Law (JD)	9645.5
95	Evening & Weekend MBA	7932.5
233	Sociology	6719.5
217	Psychology	6533.0
40	Chemical Engineering	6126.5
41	Chemistry	5941.5
13	Architecture	5680.5
85	English	5520.0
187	Molecular & Cell Biology	5431.0
125	Info & Data Science-MIDS	5105.0
26	Bioengineering	5061.0
128	Integrative Biology	5060.5
211	Physics	5055.0

Lecture 1 – Data 100, Fall 2023¶

Software Packages¶

1. Starting with a Question: Who are you (the students of Data 100)?¶

2. Data Acquisition and Cleaning¶

3. Exploratory Data Analysis¶

Peeking at the Data¶

What is one potential issue we may need to address in this data?¶

Exploratory Data Analysis on `names` dataset¶

What is the most common first letter in names? What is its distribution?¶

What is the distribution of the length of names?¶

How many records do we have?¶

Understanding the structure of data¶

What is the meaning of the Role field?¶

Exploratory Data Analysis on `majors` dataset¶

Summarizing the Data¶

4. Analysis: Understanding the World¶

What are your majors?¶

We will often use visualizations to make sense of data¶

What year are you?¶

1. New Questions¶

How could we answer this question?¶

We don't have the data.¶

UC Berkeley Student Headcount by Major and Demographics¶

2. Acquire data programmatically¶

3. Exploratory Data Analysis on Campus Data¶

What is the historical distribution of graduate and undergraduate students at Berkeley?¶

4.1. Ratio between graduate and undergraduate students in Data 100, and its comparison with campus distribution¶

4.2. Proportion of different majors in Data 100, and their historical emrollment trends¶

	Majors	Terms in Attendance
0	Letters & Sci Undeclared UG	3
1	Data Science BA	7
2	Economics BA	8
3	Letters & Sci Undeclared UG	5
4	Economics BA	7
5	Computer Science BA, Economics BA	7
6	Business Administration BS, Letters & Sci Unde...	5
7	Letters & Sci Undeclared UG	5
8	Microbial Biology BS	5
9	Letters & Sci Undeclared UG	8
10	Letters & Sci Undeclared UG	3
11	Letters & Sci Undeclared UG	5
12	Data Science BA, Society and Environment BS	5
13	Letters & Sci Undeclared UG	3
14	Letters & Sci Undeclared UG, Microbial Biology BS	5
15	Letters & Sci Undeclared UG, Nutritional Scien...	5
16	Letters & Sci Undeclared UG	3
17	Chemistry BS	5
18	Applied Mathematics BA	7
19	Letters & Sci Undeclared UG	3

	Name	Role
0	Emily	Student
1	Zoe	Student
2	Michelle	Student
3	JAMES	Student
4	Jenny	Student

	Majors	Terms in Attendance
count	1290	1290
unique	183	7
top	Letters & Sci Undeclared UG	5
freq	353	539

	count
Majors
Letters & Sci Undeclared UG	353
Data Science BA	129
Computer Science BA	124
Electrical Eng & Comp Sci BS	69
Economics BA	65
Cognitive Science BA	29
Electrical Eng & Comp Sci MEng	25
Applied Mathematics BA	24
Civil Engineering BS	22
Mol Sci & Software Engin MMSSE	19
Business Administration BS, Letters & Sci Undeclared UG	19
Data Science BA, Economics BA	17
Chemical Engineering BS	15
Development Engineering MDE	13
Public Policy MPP	12
Computer Science BA, Data Science BA	11
Info Mgmt & Systems MIMS	10
Molecular & Cell Biology BA	10
Bioengineering BS	9
Public Health BA	8

Lecture 1 – Data 100, Fall 2023¶

Software Packages¶

1. Starting with a Question: Who are you (the students of Data 100)?¶

2. Data Acquisition and Cleaning¶

3. Exploratory Data Analysis¶

Peeking at the Data¶

What is one potential issue we may need to address in this data?¶

Exploratory Data Analysis on names dataset¶

What is the most common first letter in names? What is its distribution?¶

What is the distribution of the length of names?¶

How many records do we have?¶

Understanding the structure of data¶

What is the meaning of the Role field?¶

Exploratory Data Analysis on majors dataset¶

Summarizing the Data¶

4. Analysis: Understanding the World¶

What are your majors?¶

We will often use visualizations to make sense of data¶

What year are you?¶

1. New Questions¶

How could we answer this question?¶

We don't have the data.¶

UC Berkeley Student Headcount by Major and Demographics¶

2. Acquire data programmatically¶

3. Exploratory Data Analysis on Campus Data¶

What is the historical distribution of graduate and undergraduate students at Berkeley?¶

4.1. Ratio between graduate and undergraduate students in Data 100, and its comparison with campus distribution¶

4.2. Proportion of different majors in Data 100, and their historical emrollment trends¶

Exploratory Data Analysis on `names` dataset¶

Exploratory Data Analysis on `majors` dataset¶