# Linear algebra, probability
import numpy as np

# Data manipulation
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Interactive visualization library
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.express as px

# pandas (imported as pd) is covered starting in the next lecture; some of
# its syntax resembles data8's datascience package.
# Read the two CSV files into dataframes.
majors = pd.read_csv("data/majors.csv")
names = pd.read_csv("data/names.csv")

# Preview the first 20 rows of the majors dataframe.
majors.head(n=20)

# Preview the names dataframe; head() shows 5 rows when no count is given.
names.head()

# Lowercase every entry of the Name column so that differently-capitalized
# spellings of the same name are counted together later on.
names = names.assign(Name=names['Name'].str.lower())

names.head()

# Print the number of rows in each dataframe, names first, one per line.
print(len(names), len(majors), sep="\n")

1206
1206

# Tally how many rows carry each distinct value of the Role column.
names["Role"].value_counts()

Role
Student             1182
Waitlist Student      23
#REF!                  1
Name: count, dtype: int64

# Boolean index to find the broken row(s). The Role tally shows one "#REF!"
# entry; because the Name column was lowercased earlier, the same spreadsheet
# error would appear there as "#ref!". Checking both columns means the filter
# does not depend on which column the error landed in.
broken_rows = names[['Name', 'Role']].isin(["#REF!", "#ref!"]).any(axis=1)
names[broken_rows]

# Drop the broken row(s). The explicit .copy() makes `names` an independent
# dataframe rather than a selection taken from the original one.
names = names[~broken_rows].copy()

names['Role'].value_counts().to_frame()  # Again, counts of unique Roles.

names['Name'].value_counts().to_frame()   # Counting the frequency of each unique name.

majors.columns   # Get column names

Index(['Majors', 'Terms in Attendance'], dtype='object')

# Tally the distinct Terms in Attendance values as a one-column dataframe.
majors['Terms in Attendance'].value_counts().to_frame()

# Show the rows whose Terms in Attendance is the "#REF!" marker.
majors[majors['Terms in Attendance'].eq("#REF!")]

# Keep every row except those, then tally again to check the result.
majors = majors[majors['Terms in Attendance'].ne("#REF!")]
majors['Terms in Attendance'].value_counts().to_frame()

# Summary statistics for each dataframe.
names.describe()

majors.describe()

# Build the top-20 table step by step instead of as one method chain:
# tally each distinct value of the Majors column, order the tallies from
# highest to lowest, then keep the first 20 as a one-column dataframe.
major_tallies = majors['Majors'].value_counts()
major_tallies = major_tallies.sort_values(ascending=False)
majors_count = major_tallies.head(20).to_frame()

majors_count

# Interactive horizontal bar chart (plotly) of the top-20 major counts.
# The table is reversed row-wise before it is handed to plotly.
fig = px.bar(majors_count.iloc[::-1], orientation='h')
bar_layout = dict(
    showlegend=False,
    xaxis_title='Count',
    yaxis_title='Major',
    autosize=False,
    width=800,
    height=500,
)
fig.update_layout(**bar_layout)

# Histogram of the sorted Terms in Attendance values; histnorm='probability'
# is what the "Fraction of Class" axis title below refers to.
sorted_terms = majors['Terms in Attendance'].sort_values()
fig = px.histogram(sorted_terms, histnorm='probability')

hist_layout = dict(
    showlegend=False,
    xaxis_title="Term",
    yaxis_title="Fraction of Class",
    autosize=False,
    width=800,
    height=250,
)
fig.update_layout(**hist_layout)

# Replace the terms-in-attendance codes with the degree objective:
# 'G' becomes 'Graduate' and every other value becomes 'Undergraduate'.
is_grad = majors['Terms in Attendance'].eq('G')
majors['Terms in Attendance'] = is_grad.map({True: 'Graduate', False: 'Undergraduate'})

# Rename the column to 'Ungrad Grad', the same column name that is later
# requested from the campus-wide dataset.
majors = majors.rename(columns={'Terms in Attendance': 'Ungrad Grad'})

majors.describe()

# CSV export endpoint of the Google Sheet holding the campus headcount data.
url = "https://docs.google.com/spreadsheets/d/1J7tz3GQLs3M6hFseJCE9KhjVhe4vKga8Q2ezu0oG5sQ/gviz/tq?tqx=out:csv"

# Only these columns are read from the sheet.
wanted_columns = ['Academic Yr', 'Semester', 'Ungrad Grad',
                  'Entry Status', 'Major Short Nm', 'Student Headcount']
university_majors = pd.read_csv(url, usecols=wanted_columns)

# Examining the data
university_majors

# Report student data per academic year: Semester is left out of the grouping
# keys, so Student Headcount is averaged over the rows that differ only in
# that column. The keys come back as ordinary columns.
year_keys = ['Academic Yr', 'Ungrad Grad', 'Entry Status', 'Major Short Nm']
university_majors = (
    university_majors
    .groupby(year_keys)[["Student Headcount"]]
    .mean()
    .reset_index()
)

university_majors

# Total headcount per academic year and per Ungrad Grad value,
# with the grouping keys kept as ordinary columns.
level_keys = ['Academic Yr', 'Ungrad Grad']
university_grad_vs_ungrd = (
    university_majors
    .groupby(level_keys)[["Student Headcount"]]
    .sum()
    .reset_index()
)

# One row per academic year, one column per Ungrad Grad value.
proportions = university_grad_vs_ungrd.pivot(
    index='Academic Yr',
    columns='Ungrad Grad',
    values='Student Headcount',
)

# Derived columns: the combined headcount and each group's share of it.
yearly_total = proportions['Undergraduate'] + proportions['Graduate']
proportions['Total'] = yearly_total
proportions['Undergrad Proportion'] = proportions['Undergraduate'] / yearly_total
proportions['Grad Proportion'] = proportions['Graduate'] / yearly_total


# Bar chart of the raw headcounts per academic year.
fig = px.bar(
    proportions.reset_index(),
    x='Academic Yr',
    y=['Undergraduate', 'Graduate'],
    title='Number of Grad vs. Undergrad Students',
    labels={'value': 'Number of Students'},
    color_discrete_map={'Undergraduate': 'blue', 'Graduate': 'orange'},
)

count_layout = dict(barmode='relative', autosize=False, width=800, height=600)
fig.update_layout(**count_layout)
fig.show()

# Count the Data 100 rows labelled 'Graduate' and 'Undergraduate' by summing
# the boolean comparison result (each True counts as one).
data100_grad = (majors['Ungrad Grad'] == 'Graduate').sum()

data100_undergrad = (majors['Ungrad Grad'] == 'Undergraduate').sum()

print("Number of graduate students in Data 100: ", data100_grad)
print("Number of undergraduate students in Data 100: ", data100_undergrad)

Number of graduate students in Data 100:  113
Number of undergraduate students in Data 100:  1092

# Build a one-row table for Data 100 using the same column names that
# `proportions` already has, computing the class total only once.
data100_total = data100_grad + data100_undergrad

data100_row = {
    'Graduate': [data100_grad],
    'Undergraduate': [data100_undergrad],
    'Total': [data100_total],
    'Undergrad Proportion': [data100_undergrad / data100_total],
    'Grad Proportion': [data100_grad / data100_total],
}

new_row_df = pd.DataFrame(data100_row)

# Store that single row in `proportions` under the label 'Data 100'.
proportions.loc['Data 100'] = new_row_df.iloc[0]


# Bar chart of the undergraduate/graduate shares for every row of
# `proportions`, which now includes the 'Data 100' row.
fig = px.bar(
    proportions.reset_index(),
    x='Academic Yr',
    y=['Undergrad Proportion', 'Grad Proportion'],
    title='Proportions of Grad vs. Undergrad Students',
    labels={'value': 'Proportion'},
    color_discrete_map={'Undergrad Proportion': 'blue', 'Grad Proportion': 'orange'},
)


share_layout = dict(barmode='relative', autosize=False, width=800, height=600)
fig.update_layout(**share_layout)
fig.show()

	Majors	Terms in Attendance
0	Computer Science BA, Linguistics BA	8
1	Civil Engineering BS	6
2	Economics BA	7
3	Environ Econ & Policy BS, Letters & Sci Undecl...	8
4	Letters & Sci Undeclared UG	4
5	Letters & Sci Undeclared UG	4
6	Data Science BA, Economics BA	6
7	Economics BA	8
8	Letters & Sci Undeclared UG	4
9	Applied Mathematics BA	6
10	Letters & Sci Undeclared UG	4
11	Letters & Sci Undeclared UG	4
12	Letters & Sci Undeclared UG	8
13	Economics BA	8
14	Letters & Sci Undeclared UG	4
15	Environmental Sciences BS	8
16	Chemistry PhD	G
17	Civil Engineering BS	6
18	Molecular & Cell Biology BA	6
19	Applied Mathematics BA	6

	count
Name
daniel	10
kevin	9
joshua	8
ashley	8
ryan	8
...	...
akili	1
madeline	1
sharaf	1
gauri	1
zhixuan	1

	count
Terms in Attendance
4	399
6	379
8	189
G	113
2	85
—	16
7	15
5	6
3	2
U	1
#REF!	1

	count
Terms in Attendance
4	399
6	379
8	189
G	113
2	85
—	16
7	15
5	6
3	2
U	1

	count
Majors
Letters & Sci Undeclared UG	294
Data Science BA	123
Civil Engineering BS	82
Economics BA	50
Computer Science BA	46
Applied Mathematics BA	42
Electrical Eng & Comp Sci BS	42
Cognitive Science BA	41
Statistics BA	38
Data Science BA, Letters & Sci Undeclared UG	33
Mol Sci & Software Engin MMSSE	32
Molecular & Cell Biology BA	20
UCBX Concurrent International	16
Civil & Env Eng Prof MS	14
Business Administration BS	14
Computer Science BA, Letters & Sci Undeclared UG	13
Environmental Sciences BS	12
Electrical Eng & Comp Sci MEng	12
Applied Mathematics BA, Computer Science BA	10
Letters & Sci Undeclared UG, Statistics BA	10

Lecture 1 – Data 100, Spring 2025¶

Software Packages¶

1. Starting with a Question: Who are you (the students of Data 100)?¶

2. Data Acquisition and Cleaning¶

3. Exploratory Data Analysis¶

Peeking at the Data¶

What is one potential issue we may need to address in this data?¶

Exploratory Data Analysis on `names` dataset¶

How many records do we have?¶

Understanding the structure of data¶

Most Frequent Names¶

Exploratory Data Analysis on `majors` dataset¶

Summarizing the Data¶

4. Analysis: Understanding the World¶

What are your majors?¶

We will often use visualizations to make sense of the data¶

What year are you?¶

1. New Question¶

How could we answer this question?¶

We don't have the data.¶

UC Berkeley Student Headcount by Major and Demographics¶

2. Acquire Data Programmatically¶

3. Exploratory Data Analysis on Campus Data¶

What is the historical distribution of graduate and undergraduate students at Berkeley?¶

4. Analysis¶

	Name	Role
0	Daniel	Student
1	Michael	Student
2	Harry	Student
3	SAMUEL	Student
4	Kyle	Student

	Majors	Terms in Attendance
count	1205	1205
unique	154	10
top	Letters & Sci Undeclared UG	4
freq	294	399

	Academic Yr	Semester	Ungrad Grad	Entry Status	Major Short Nm	Student Headcount
0	2014-15	Fall	Graduate	Graduate	Education	335
1	2014-15	Fall	Graduate	Graduate	Educational Leadership Jnt Pgm	1
2	2014-15	Fall	Graduate	Graduate	Special Education	18
3	2014-15	Fall	Graduate	Graduate	Science & Math Education	15
4	2014-15	Fall	Graduate	Graduate	Chemical Engineering	136
...	...	...	...	...	...	...
7199	2023-24	Spring	Undergraduate	Transfer Entrant	Nut Sci-Physio & Metabol	13
7200	2023-24	Spring	Undergraduate	Transfer Entrant	Nutritional Sci-Dietetics	1
7201	2023-24	Spring	Undergraduate	Transfer Entrant	Nutritional Sci-Toxicology	2
7202	2023-24	Spring	Undergraduate	Transfer Entrant	Genetics & Plant Biology	11
7203	2023-24	Spring	Undergraduate	Transfer Entrant	Microbial Biology	39

Lecture 1 – Data 100, Spring 2025¶

Software Packages¶

1. Starting with a Question: Who are you (the students of Data 100)?¶

2. Data Acquisition and Cleaning¶

3. Exploratory Data Analysis¶

Peeking at the Data¶

What is one potential issue we may need to address in this data?¶

Exploratory Data Analysis on names dataset¶

How many records do we have?¶

Understanding the structure of data¶

Most Frequent Names¶

Exploratory Data Analysis on majors dataset¶

Summarizing the Data¶

4. Analysis: Understanding the World¶

What are your majors?¶

We will often use visualizations to make sense of the data¶

What year are you?¶

1. New Question¶

How could we answer this question?¶

We don't have the data.¶

UC Berkeley Student Headcount by Major and Demographics¶

2. Acquire Data Programmatically¶

3. Exploratory Data Analysis on Campus Data¶

What is the historical distribution of graduate and undergraduate students at Berkeley?¶

4. Analysis¶

Exploratory Data Analysis on `names` dataset¶

Exploratory Data Analysis on `majors` dataset¶