import warnings
# NOTE: with no category/module filter this silences every warning for the
# rest of the session, not only the seaborn ones mentioned below.
warnings.filterwarnings("ignore") # For sns styling-related warnings
# Render the summary charts. NOTE(review): these helpers are defined further
# down in this file, so the calls presumably rely on those definitions having
# been executed first (notebook cell order) — confirm before running the file
# top to bottom.
top_10_views()

bottom_10_views()

top_10_rating()

bottom_10_rating()

series_length()

import kagglehub

# Download the dataset and report the local location it resolves to; `path`
# is used afterwards to locate the CSV file.
path = kagglehub.dataset_download(
    "bharatnatrayn/movies-dataset-for-feature-extracion-prediction"
)
print("Path to dataset:", path)

Using Colab cache for faster access to the 'movies-dataset-for-feature-extracion-prediction' dataset.
Path to dataset: /kaggle/input/movies-dataset-for-feature-extracion-prediction

import pandas as pd
# Load the raw CSV from the downloaded dataset location and preview it.
movies = pd.read_csv("{}/movies.csv".format(path))
movies.head(5)

# Summary statistics of the raw frame.
movies.describe()

# Number of rows that are exact duplicates of an earlier row.
movies.duplicated().sum()

np.int64(431)

# Discard exact duplicate rows, keeping the first occurrence of each.
movies = movies.drop_duplicates(keep='first')

# Raw GENRE text of the first row.
movies.loc[0, 'GENRE']

'\nAction, Horror, Thriller            '

# Raw STARS text of the first row (director and cast share one string).
movies.loc[0, 'STARS']

'\n    Director:\nPeter Thorwarth\n| \n    Stars:\nPeri Baumeister, \nCarl Anton Koch, \nAlexander Scheer, \nKais Setti\n'

# Raw ONE-LINE text of the first row.
movies.loc[0, 'ONE-LINE']

'\nA woman with a mysterious illness is forced into action when a group of terrorists attempt to hijack a transatlantic overnight flight.'

# Text columns whose raw values carry embedded newlines and padding.
cols = ['GENRE', 'STARS', 'ONE-LINE']

# Work on a copy so the raw frame stays available for comparison.
movies_format = movies.copy()
movies_format[cols] = movies_format[cols].apply(
    lambda text: text.str.replace('\n', '').str.strip()
)

# Show the first row of every cleaned column.
for col in cols:
    print(movies_format[col][0])

Action, Horror, Thriller
Director:Peter Thorwarth|     Stars:Peri Baumeister, Carl Anton Koch, Alexander Scheer, Kais Setti
A woman with a mysterious illness is forced into action when a group of terrorists attempt to hijack a transatlantic overnight flight.

# Raw YEAR strings of the first three rows.
movies['YEAR'][0:3]

movies_year = movies_format.copy()
# The parentheses are literal characters, so replace them as plain text
# (regex=False, as the VOTES clean-up below already does) rather than relying
# on the default, which differs between pandas versions.
movies_year['YEAR'] = (
    movies_year['YEAR']
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
    .str.strip()
)
movies_year['YEAR'][0:3]

# Split "start–end" on the en dash. n=1 caps the result at two parts and the
# reindex guarantees both columns exist, so the assignments below cannot fail
# on a value with several dashes or on data without any dash at all.
year_parts = (
    movies_year['YEAR']
    .str.split('–', n=1, expand=True)
    .reindex(columns=[0, 1])
)
# Anything that is not a plain number after the split is coerced to NaN.
movies_year['Start_Year'] = pd.to_numeric(year_parts[0], errors='coerce')
movies_year['End_Year'] = pd.to_numeric(year_parts[1], errors='coerce')

# A dash in YEAR marks a series; Is_Ongoing flags series whose end year is
# missing.
movies_year['Is_Series'] = movies_year['YEAR'].str.contains('–', na=False)
movies_year['Is_Ongoing'] = movies_year['Is_Series'] & movies_year['End_Year'].isna()

# Single-year titles and ongoing series get End_Year = Start_Year.
movies_year.loc[~movies_year['Is_Series'], 'End_Year'] = movies_year['Start_Year']
# Is_Ongoing already implies Is_Series, so it is a sufficient mask on its own.
movies_year.loc[movies_year['Is_Ongoing'], 'End_Year'] = movies_year['Start_Year']

# YEAR is now represented by the derived columns; ONE-LINE is dropped as well.
movies_year = movies_year.drop(columns=['YEAR', 'ONE-LINE'])

# VOTES uses thousands separators (e.g. "21,062"); strip them, then convert.
movies_year['VOTES'] = pd.to_numeric(
    movies_year['VOTES'].str.replace(',', '', regex=False),
    errors='coerce',
)

movies_star = movies_year.copy()

# The cleaned credits text looks like "Director:<name>|     Stars:<a>, <b>".
# The optional "s" in the patterns below also accepts the "Directors:" and
# "Star:" spellings of the labels, so such rows (if the data contains them)
# are parsed instead of leaking the label into the actor list.
movies_star['Director'] = (
    movies_star['STARS']
    .str.extract(r'Directors?:\s*([^|]+)', expand=False)
    .str.strip()
    .fillna('NA')  # titles without a director credit
)

# Remove the director section. The trailing pipe is optional so that a
# director-only entry (no "|" and no stars) is removed completely too.
stars_text = (
    movies_star['STARS']
    .str.replace(r'Directors?:[^|]*\|?', '', regex=True)
    .str.replace(r'Stars?:', '', regex=True)
    .str.strip()
)

# Split into individual names, trimming padding and discarding empty pieces
# so an empty credit string yields [] (and a Star_Count of 0), not [''].
movies_star['Stars_List'] = stars_text.str.split(',').apply(
    lambda names: [name.strip() for name in names if name.strip()]
    if isinstance(names, list) else names
)
movies_star = movies_star.drop(columns=['STARS'])

# Number of credited stars; rows without a list (missing credits) count as 0.
movies_star['Star_Count'] = movies_star['Stars_List'].apply(
    lambda names: len(names) if isinstance(names, list) else 0
)

movies_star.head()

def _strip_each(labels):
  """Trim every entry of a list of labels; pass non-list values through."""
  if not isinstance(labels, list):
    return labels
  return [label.strip() for label in labels]


# Turn the comma-separated GENRE text into a list of trimmed genre names.
movies_star['GENRE'] = movies_star['GENRE'].str.split(',').map(_strip_each)

# One row per (title, genre) pair, kept in a separate frame.
movies_genre = movies_star.explode('GENRE')
movies_genre.head()

import seaborn as sns
import matplotlib.pyplot as plt

# Count how many titles carry each genre (one row per title/genre pair).
genre_counts = movies_genre['GENRE'].value_counts().reset_index()
genre_counts.columns = ['Genre', 'Count']

plt.figure(figsize=(10,6))
# seaborn deprecates `palette` without `hue`; mapping hue to the x variable
# with the legend disabled keeps the per-bar colours.
sns.barplot(
    data=genre_counts,
    x='Genre',
    y='Count',
    hue='Genre',
    legend=False,
    palette='Set2'
)

plt.xticks(rotation=45)
plt.title('Frequency of Genres')
plt.xlabel('Genre')
plt.ylabel('Count')
plt.tight_layout()
plt.show()

/tmp/ipython-input-3438722267.py:8: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(

# Missing values per column.
movies_star.isnull().sum()

# Titles without a genre get an explicit placeholder.
movies_star['GENRE'] = movies_star['GENRE'].fillna('Unknown')

# Rows lacking a rating or a vote count are removed.
movies_star = movies_star.dropna(subset=['RATING', 'VOTES'])

# Mean runtime of movies (Is_Series == False) and of series (True).
movies_star.groupby('Is_Series')['RunTime'].mean()

# Fill each missing runtime with the mean of its own group (movie or series).
# The means are computed from the data instead of being pasted in as
# constants, so they stay correct if the upstream cleaning changes.
runtime_group_mean = movies_star.groupby('Is_Series')['RunTime'].transform('mean')
movies_star['RunTime'] = movies_star['RunTime'].fillna(runtime_group_mean)

movies_clean = movies_star.copy()
movies_clean.drop(columns=['Gross'], inplace=True)

# Both years unknown: mark them with the sentinel 0. Both conditions are read
# from movies_clean itself, so the mask has exactly this frame's index and
# does not depend on aligning with another frame that has different rows.
mask = movies_clean['Start_Year'].isna() & movies_clean['End_Year'].isna()
movies_clean.loc[mask, ['Start_Year', 'End_Year']] = 0


# Start_Year unknown but End_Year known: reuse the end year as the start.
mask = movies_clean['Start_Year'].isna() & movies_clean['End_Year'].notna()
movies_clean.loc[mask, 'Start_Year'] = movies_clean.loc[mask, 'End_Year']

# Box plots of both year columns (the 0 sentinel rows are included).
plt.figure(figsize=(6, 4))
movies_clean[['Start_Year', 'End_Year']].boxplot()
plt.xticks(rotation=45)
plt.title("Start & End Year Outlier Detection")
plt.show()

# Mean rating per content type, with the boolean flag turned into a label.
content_type_labels = {True: 'Series', False: 'Movie'}
avg_rating = movies_clean.groupby("Is_Series")["RATING"].mean()
avg_rating.index = avg_rating.index.map(content_type_labels)

plt.figure(figsize=(6,4))
ax = sns.barplot(x=avg_rating.index, y=avg_rating.values)
ax.set_title("Average Rating: Movies vs Series")
ax.set_ylabel("Average Rating")
ax.set_xlabel("Content Type")

# Print each bar's value just above it.
for bar_pos, bar_height in enumerate(avg_rating.values):
    ax.text(bar_pos, bar_height + 0.05, f"{bar_height:.2f}", ha='center')

plt.show()

# Keep only the series and count how many are ongoing vs. completed.
series_df = movies_clean.loc[movies_clean['Is_Series']]
status_labels = {True: 'Ongoing', False: 'Completed'}
ongoing_counts = series_df['Is_Ongoing'].value_counts()
ongoing_counts.index = ongoing_counts.index.map(status_labels)

plt.figure(figsize=(7,5))
ax = sns.barplot(
    x=ongoing_counts.index,
    y=ongoing_counts.values,
    hue=ongoing_counts.index,
    legend=False,
    palette="Set3",
)
ax.set_title("Ongoing vs Completed Series")
ax.set_xlabel("Series Status")
ax.set_ylabel("Count")

# Label every bar with its count, placed just above the bar.
for bar_pos, count in enumerate(ongoing_counts.values):
    ax.text(bar_pos, count + 1, str(count), ha='center', va='bottom')

plt.show()

# Histogram of the ratings (20 bins) with a kernel density curve on top.
plt.figure()
rating_hist_ax = sns.histplot(movies_clean["RATING"].dropna(), bins=20, kde=True)
rating_hist_ax.set_title("Distribution of Audience Ratings")
rating_hist_ax.set_xlabel("Rating")
rating_hist_ax.set_ylabel("Count of Titles")
plt.show()

plt.figure(figsize=(8,6))
# Plot from movies_clean, where VOTES has been converted to numbers;
# movies_format still holds the raw comma-formatted strings, which cannot be
# placed on a (log) numeric axis. This also matches the frame used for the
# correlation below.
sns.scatterplot(data=movies_clean, x="VOTES", y="RATING")
plt.xscale("log")
plt.title("Rating vs Votes")
plt.show()

# Correlation matrix between rating and vote count.
correlation = movies_clean[["RATING", "VOTES"]].corr()
print("Correlation between Rating and Votes:")
print(correlation)

Correlation between Rating and Votes:
          RATING     VOTES
RATING  1.000000  0.103792
VOTES   0.103792  1.000000

# Correlation matrix between runtime and rating.
runtime_and_rating = movies_clean[["RunTime", "RATING"]]
corr_runtime_rating = runtime_and_rating.corr()
corr_runtime_rating

# The single record with the longest runtime.
by_runtime_desc = movies_clean.sort_values(by='RunTime', ascending=False)
top_runtime_records = by_runtime_desc.head(1)

top_runtime_records

# First rows of the series subset.
movies_clean.loc[movies_clean['Is_Series']].head(5)

movies_clean.head(10)

# Mean rating per decade of the start year; rows carrying the 0 sentinel
# for an unknown year are left out.
dated_titles = movies_clean[movies_clean["Start_Year"] != 0]
decade_avg = (
    dated_titles
    .assign(Decade=lambda frame: (frame["Start_Year"] // 10) * 10)
    .groupby("Decade", as_index=False)["RATING"]
    .mean()
    .sort_values("Decade")
)

plt.figure(figsize=(10, 6))

decade_ax = sns.lineplot(data=decade_avg, x="Decade", y="RATING", marker="o")
decade_ax.set_title("Average Rating by Decade")
decade_ax.set_xlabel("Decade")
decade_ax.set_ylabel("Average Rating")
decade_ax.set_ylim(5.5, 7.5)
plt.tight_layout()
plt.show()

start_years = movies_clean["Start_Year"]

# Titles per start year, skipping the 0 sentinel used for unknown years.
year_count = start_years[start_years != 0].value_counts().sort_index()

plt.figure(figsize=(10,6))
year_count.plot()
plt.title("Movies over Years")
plt.show()

# Same count restricted to start years after 1995.
year_count = start_years[start_years > 1995].value_counts().sort_index()

plt.figure(figsize=(10,6))
year_count.plot()
plt.title("Number of Movies Featured")
plt.show()

# Working copy with a readable content-type label next to the boolean flag.
content_type_names = {False: "Movies", True: "Series"}
df = movies_clean.assign(
    Content_Type=movies_clean["Is_Series"].map(content_type_names)
)

# Mean rating and mean vote count per content type.
performance = (
    df.groupby("Content_Type", as_index=False)[["RATING", "VOTES"]].mean()
)

plt.figure(figsize=(8,5))
votes_ax = sns.barplot(data=performance, x="Content_Type", y="VOTES")
votes_ax.set_title("Average Votes: Movies vs Series")
votes_ax.set_xlabel("Content Type")
votes_ax.set_ylabel("Average Votes")
plt.show()

performance

def top_10_views():
  """Plot the ten genres with the most total votes.

  Reads the module-level ``df`` frame, explodes its ``GENRE`` lists into one
  row per (title, genre) pair and aggregates per genre: total votes, mean
  rating and number of titles. The ten genres with the highest vote totals
  are drawn as a horizontal bar chart and also stored in the module-level
  variable ``x`` so the caller can display the table. Returns ``None``.
  """
  global x

  df_exploded = df.explode("GENRE")

  genre_votes_sum = (
      df_exploded
      .groupby("GENRE")
      .agg({
          "VOTES": "sum",
          "RATING": "mean",
          "MOVIES": "count"
      })
      .sort_values("VOTES", ascending=False)
  )

  top_genres_votes = genre_votes_sum.head(10)

  plt.figure(figsize=(9,6))
  # seaborn deprecates `palette` without `hue`; mapping hue to the y variable
  # with the legend disabled keeps the per-bar colours.
  sns.barplot(
      x=top_genres_votes["VOTES"],
      y=top_genres_votes.index,
      hue=top_genres_votes.index,
      legend=False,
      palette="Set2"
  )

  plt.title("Top 10 Genres by Total Audience Engagement")
  plt.xlabel("Total Votes")
  plt.ylabel("Genre")
  plt.show()

  x = top_genres_votes

top_10_views()
x

/tmp/ipython-input-2598965299.py:18: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(

def bottom_10_views():
  """Plot the ten genres with the fewest total votes.

  Reads the module-level ``df`` frame, explodes its ``GENRE`` lists into one
  row per (title, genre) pair and aggregates per genre: total votes, mean
  rating and number of titles. The "Unknown" placeholder (assigned to titles
  without a genre) is excluded, then the ten genres with the lowest vote
  totals are drawn as a horizontal bar chart and stored in the module-level
  variable ``x`` so the caller can display the table. Returns ``None``.
  """
  global x

  # Explode here rather than depending on a `df_exploded` global, which this
  # file only ever creates as a local inside other functions.
  df_exploded = df.explode("GENRE")

  genre_votes_sum = (
      df_exploded
      .groupby("GENRE")
      .agg({
          "VOTES": "sum",
          "RATING": "mean",
          "MOVIES": "count"
      })
      .sort_values("VOTES", ascending=True)
  )

  # Drop the placeholder by name instead of skipping whichever row happens to
  # rank first, so a real genre is never discarded by position.
  bottom_genres_votes = (
      genre_votes_sum
      .drop(index="Unknown", errors="ignore")
      .head(10)
  )

  plt.figure(figsize=(9,6))
  # seaborn deprecates `palette` without `hue`; mapping hue to the y variable
  # with the legend disabled keeps the per-bar colours.
  sns.barplot(
      x=bottom_genres_votes["VOTES"],
      y=bottom_genres_votes.index,
      hue=bottom_genres_votes.index,
      legend=False,
      palette="Set2"
  )

  plt.title("Bottom 10 Genres by Total Audience Engagement")
  plt.xlabel("Total Votes")
  plt.ylabel("Genre")
  plt.show()

  x = bottom_genres_votes

bottom_10_views()
x

/tmp/ipython-input-4038548646.py:16: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(

def top_10_rating():
  """Plot the ten genres with the highest average rating.

  Reads the module-level ``df`` frame, explodes its ``GENRE`` lists into one
  row per (title, genre) pair and computes the mean rating and mean vote
  count per genre. The ten best-rated genres are drawn as a horizontal bar
  chart (x axis limited to 7–7.5) and stored in the module-level variable
  ``x`` so the caller can display the table. Returns ``None``.
  """
  global x

  df_exploded = df.explode("GENRE")

  genre_perf_rating = (
      df_exploded
      .groupby("GENRE")
      .agg({
          "RATING": "mean",
          "VOTES": "mean"
      })
      .sort_values("RATING", ascending=False)
  )

  top_genres_rating = genre_perf_rating.head(10)

  plt.figure(figsize=(8,6))
  # seaborn deprecates `palette` without `hue`; mapping hue to the y variable
  # with the legend disabled keeps the per-bar colours.
  sns.barplot(
      x=top_genres_rating["RATING"],
      y=top_genres_rating.index,
      hue=top_genres_rating.index,
      legend=False,
      palette="Set2"
  )

  plt.title("Top 10 Genres by Average Rating")
  plt.xlabel("Average Rating")
  plt.ylabel("Genre")
  # Zoomed axis so the small differences between the top genres are visible.
  plt.xlim(7, 7.5)
  plt.show()

  x = top_genres_rating

top_10_rating()
x

/tmp/ipython-input-3877282698.py:17: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(

def bottom_10_rating():
  """Plot the ten genres with the lowest average rating.

  Reads the module-level ``df`` frame, explodes its ``GENRE`` lists into one
  row per (title, genre) pair and computes the mean rating and mean vote
  count per genre. The "Unknown" placeholder (assigned to titles without a
  genre) is excluded, then the ten worst-rated genres are drawn as a
  horizontal bar chart (x axis limited to 5–7) and stored in the module-level
  variable ``x`` so the caller can display the table. Returns ``None``.
  """
  global x

  # Explode here rather than depending on a `df_exploded` global, which this
  # file only ever creates as a local inside other functions.
  df_exploded = df.explode("GENRE")

  genre_perf_rating = (
      df_exploded
      .groupby("GENRE")
      .agg({
          "RATING": "mean",
          "VOTES": "mean"
      })
      .sort_values("RATING", ascending=True)
  )

  # Exactly ten real genres, matching the chart title: the placeholder is
  # removed by name before taking the first ten rows.
  bottom_genres_rating = (
      genre_perf_rating
      .drop(index="Unknown", errors="ignore")
      .head(10)
  )

  plt.figure(figsize=(8,6))
  # seaborn deprecates `palette` without `hue`; mapping hue to the y variable
  # with the legend disabled keeps the per-bar colours.
  sns.barplot(
      x=bottom_genres_rating["RATING"],
      y=bottom_genres_rating.index,
      hue=bottom_genres_rating.index,
      legend=False,
      palette="Set2"
  )

  plt.title("Bottom 10 Genres by Average Rating")
  plt.xlabel("Average Rating")
  plt.ylabel("Genre")
  plt.xlim(5, 7)
  plt.show()

  x = bottom_genres_rating

bottom_10_rating()
x

def series_length():
  """Plot average rating and total votes per series-length category.

  Reads the module-level ``completed_series`` frame (defined outside this
  function; it must provide ``Length_Category``, ``RATING`` and ``VOTES``)
  and draws two bar charts side by side: mean rating per category, and total
  votes per category on a log scale. Each bar is annotated with its value.
  As a side effect the seaborn style and palette are changed globally.
  Returns ``None``.
  """
  # observed=False is spelled out to keep the current grouping behaviour and
  # to avoid the pandas FutureWarning about the changing default.
  length_perf = (
      completed_series
      .groupby("Length_Category", observed=False)
      .agg({
          "RATING": "mean",
          "VOTES": "sum"
      })
      .reset_index()
  )

  sns.set_style("whitegrid")
  sns.set_palette("Set2")

  fig, axes = plt.subplots(1, 2, figsize=(12,5))

  # Rating Plot
  sns.barplot(
      data=length_perf,
      x="Length_Category",
      y="RATING",
      ax=axes[0]
  )

  axes[0].set_title("Average Rating by Series Length",
                    fontsize=13)
  axes[0].set_xlabel("Series Length")
  axes[0].set_ylabel("Average Rating")
  # Zoomed y axis so the differences between categories are visible.
  axes[0].set_ylim(7.4, 8.2)

  # Print each mean rating just above its bar.
  for i, v in enumerate(length_perf["RATING"]):
      axes[0].text(i, v + 0.02, f"{v:.2f}",
                  ha='center', fontsize=11)

  # Votes Plot
  sns.barplot(
      data=length_perf,
      x="Length_Category",
      y="VOTES",
      ax=axes[1]
  )

  axes[1].set_yscale("log")
  axes[1].set_title("Total Audience Engagement by Series Length (Log Scale)",
                    fontsize=13)
  axes[1].set_xlabel("Series Length")
  axes[1].set_ylabel("Total Votes (log scale)")

  # Label each bar with its vote total expressed in millions.
  for i, v in enumerate(length_perf["VOTES"]):
      axes[1].text(i, v,
                  f"{v/1e6:.1f}M",
                  ha='center',
                  va='bottom',
                  fontsize=11)

  plt.suptitle("Impact of Series Longevity on Performance",
              fontsize=16)

  plt.tight_layout()
  plt.show()

series_length()

/tmp/ipython-input-3968155922.py:2: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  length_perf = completed_series.groupby("Length_Category").agg({

	MOVIES	YEAR	GENRE	RATING	ONE-LINE	STARS	VOTES	RunTime	Gross
0	Blood Red Sky	(2021)	\nAction, Horror, Thriller	6.1	\nA woman with a mysterious illness is forced ...	\n Director:\nPeter Thorwarth\n\| \n Star...	21,062	121.0	NaN
1	Masters of the Universe: Revelation	(2021– )	\nAnimation, Action, Adventure	5.0	\nThe war for Eternia begins again in what may...	\n \n Stars:\nChris Wood, \nSara...	17,870	25.0	NaN
2	The Walking Dead	(2010–2022)	\nDrama, Horror, Thriller	8.2	\nSheriff Deputy Rick Grimes wakes up from a c...	\n \n Stars:\nAndrew Lincoln, \n...	885,805	44.0	NaN
3	Rick and Morty	(2013– )	\nAnimation, Adventure, Comedy	9.2	\nAn animated series that follows the exploits...	\n \n Stars:\nJustin Roiland, \n...	414,849	23.0	NaN
4	Army of Thieves	(2021)	\nAction, Crime, Horror	NaN	\nA prequel, set before the events of Army of ...	\n Director:\nMatthias Schweighöfer\n\| \n ...	NaN	NaN	NaN

	RATING	RunTime
count	8179.000000	7041.000000
mean	6.921176	68.688539
std	1.220232	47.258056
min	1.100000	1.000000
25%	6.200000	36.000000
50%	7.100000	60.000000
75%	7.800000	95.000000
max	9.900000	853.000000

	YEAR
0	(2021)
1	(2021– )
2	(2010–2022)

	YEAR
0	2021
1	2021–
2	2010–2022

	MOVIES	GENRE	RATING	VOTES	RunTime	Gross	Start_Year	End_Year	Is_Series	Is_Ongoing	Director	Stars_List	Star_Count
0	Blood Red Sky	Action, Horror, Thriller	6.1	21062.0	121.0	NaN	2021.0	2021.0	False	False	Peter Thorwarth	[Peri Baumeister, Carl Anton Koch, Alexander S...	4
1	Masters of the Universe: Revelation	Animation, Action, Adventure	5.0	17870.0	25.0	NaN	2021.0	2021.0	True	True	NA	[Chris Wood, Sarah Michelle Gellar, Lena Heade...	4
2	The Walking Dead	Drama, Horror, Thriller	8.2	885805.0	44.0	NaN	2010.0	2022.0	True	False	NA	[Andrew Lincoln, Norman Reedus, Melissa McBrid...	4
3	Rick and Morty	Animation, Adventure, Comedy	9.2	414849.0	23.0	NaN	2013.0	2013.0	True	True	NA	[Justin Roiland, Chris Parnell, Spencer Gramme...	4
4	Army of Thieves	Action, Crime, Horror	NaN	NaN	NaN	NaN	2021.0	2021.0	False	False	Matthias Schweighöfer	[Matthias Schweighöfer, Nathalie Emmanuel, Rub...	4

IMDb 10,000 Netflix Movies and Shows¶

Table of contents¶

Overview¶

Introduction¶

Required Libraries¶

The Problem Domain¶

Step 1: Shaping Goal¶

Step 2: Checking the data¶

Step 3: Tidying the data¶

1. Handling Duplicates¶

2. String Formatting Error¶

3. Inconsistent Year Formats¶

4. `ONE-LINE` & `VOTES` Column¶

5. Unstructured STARS Column¶

Extracting Director Column¶

Extracting Stars as Column¶

Creating Star Count Feature¶

6. Restructuring `GENRE` Column¶

7. Missing Values¶

Reasons for Missingness¶

8. Outliers¶

Visualization¶

Dataset¶

Average Runtime: Movies vs Series¶

Average Rating: Movies vs Series¶

Ongoing vs Completed Series¶

Distribution of Audience Rating¶

Popularity vs Rating¶

Runtime vs Rating¶

Insights¶

Movies vs Series¶

Genres¶

By Popularity¶

By Average Rating¶

Do Longer Series Perform Better?¶

Conclusions¶

	MOVIES	GENRE	RATING	VOTES	RunTime	Gross	Start_Year	End_Year	Is_Series	Is_Ongoing	Director	Stars_List	Star_Count
0	Blood Red Sky	Action	6.1	21062.0	121.0	NaN	2021.0	2021.0	False	False	Peter Thorwarth	[Peri Baumeister, Carl Anton Koch, Alexander S...	4
0	Blood Red Sky	Horror	6.1	21062.0	121.0	NaN	2021.0	2021.0	False	False	Peter Thorwarth	[Peri Baumeister, Carl Anton Koch, Alexander S...	4
0	Blood Red Sky	Thriller	6.1	21062.0	121.0	NaN	2021.0	2021.0	False	False	Peter Thorwarth	[Peri Baumeister, Carl Anton Koch, Alexander S...	4
1	Masters of the Universe: Revelation	Animation	5.0	17870.0	25.0	NaN	2021.0	2021.0	True	True	NA	[Chris Wood, Sarah Michelle Gellar, Lena Heade...	4
1	Masters of the Universe: Revelation	Action	5.0	17870.0	25.0	NaN	2021.0	2021.0	True	True	NA	[Chris Wood, Sarah Michelle Gellar, Lena Heade...	4

	MOVIES	GENRE	RATING	VOTES	RunTime	Start_Year	End_Year	Is_Series	Is_Ongoing	Director	Stars_List	Star_Count
1	Masters of the Universe: Revelation	[Animation, Action, Adventure]	5.0	17870.0	25.0	2021.0	2021.0	True	True	NA	[Chris Wood, Sarah Michelle Gellar, Lena Heade...	4
2	The Walking Dead	[Drama, Horror, Thriller]	8.2	885805.0	44.0	2010.0	2022.0	True	False	NA	[Andrew Lincoln, Norman Reedus, Melissa McBrid...	4
3	Rick and Morty	[Animation, Adventure, Comedy]	9.2	414849.0	23.0	2013.0	2013.0	True	True	NA	[Justin Roiland, Chris Parnell, Spencer Gramme...	4
5	Outer Banks	[Action, Crime, Drama]	7.6	25858.0	50.0	2020.0	2020.0	True	True	NA	[Chase Stokes, Madelyn Cline, Madison Bailey, ...	4
7	Dexter	[Crime, Drama, Mystery]	8.6	665387.0	53.0	2006.0	2013.0	True	False	NA	[Michael C. Hall, Jennifer Carpenter, David Za...	4

	MOVIES	GENRE	RATING	VOTES	RunTime	Start_Year	End_Year	Is_Series	Is_Ongoing	Director	Stars_List	Star_Count
0	Blood Red Sky	[Action, Horror, Thriller]	6.1	21062.0	121.0	2021.0	2021.0	False	False	Peter Thorwarth	[Peri Baumeister, Carl Anton Koch, Alexander S...	4
1	Masters of the Universe: Revelation	[Animation, Action, Adventure]	5.0	17870.0	25.0	2021.0	2021.0	True	True	NA	[Chris Wood, Sarah Michelle Gellar, Lena Heade...	4
2	The Walking Dead	[Drama, Horror, Thriller]	8.2	885805.0	44.0	2010.0	2022.0	True	False	NA	[Andrew Lincoln, Norman Reedus, Melissa McBrid...	4
3	Rick and Morty	[Animation, Adventure, Comedy]	9.2	414849.0	23.0	2013.0	2013.0	True	True	NA	[Justin Roiland, Chris Parnell, Spencer Gramme...	4
5	Outer Banks	[Action, Crime, Drama]	7.6	25858.0	50.0	2020.0	2020.0	True	True	NA	[Chase Stokes, Madelyn Cline, Madison Bailey, ...	4
6	The Last Letter from Your Lover	[Drama, Romance]	6.8	5283.0	110.0	2021.0	2021.0	False	False	Augustine Frizzell	[Shailene Woodley, Joe Alwyn, Wendy Nottingham...	4
7	Dexter	[Crime, Drama, Mystery]	8.6	665387.0	53.0	2006.0	2013.0	True	False	NA	[Michael C. Hall, Jennifer Carpenter, David Za...	4
8	Never Have I Ever	[Comedy]	7.9	34530.0	30.0	2020.0	2020.0	True	True	NA	[Maitreyi Ramakrishnan, Poorna Jagannathan, Da...	4
9	Virgin River	[Drama, Romance]	7.4	27279.0	44.0	2019.0	2019.0	True	True	NA	[Alexandra Breckenridge, Martin Henderson, Col...	4
10	Gunpowder Milkshake	[Action, Adventure, Thriller]	6.0	17989.0	114.0	2021.0	2021.0	False	False	Navot Papushado	[Karen Gillan, Lena Headey, Carla Gugino, Mich...	4

	VOTES	RATING	MOVIES
GENRE
Drama	70167787.0	7.091197	3499
Action	43766718.0	7.097396	1843
Adventure	35370287.0	7.295955	1335
Crime	33788787.0	7.080233	1376
Comedy	31948275.0	6.825589	2419
Thriller	19654895.0	6.333719	777
Mystery	13553679.0	7.089417	737
Animation	12697131.0	7.377619	1403
Sci-Fi	12224640.0	6.582593	270
Fantasy	11527266.0	6.996264	455

	VOTES	RATING	MOVIES
GENRE
News	29672.0	7.061111	18
Talk-Show	36987.0	6.960870	23
Game-Show	77194.0	6.369792	96
Film-Noir	154485.0	7.016667	12
Western	258652.0	6.720000	20
Reality-TV	432936.0	6.626437	348
Short	449292.0	6.747753	178
Sport	897016.0	6.852500	160
War	1009873.0	6.986667	45
Musical	1109089.0	7.001923	52

	RATING	VOTES
GENRE
Horror	5.860998	22722.299320
Thriller	6.333719	25295.875161
Game-Show	6.369792	804.104167
Unknown	6.563636	86.909091
Sci-Fi	6.582593	45276.444444
Reality-TV	6.626437	1244.068966
Western	6.720000	12932.600000
Short	6.747753	2524.112360
Family	6.774933	11522.326146
Romance	6.800786	14382.218873
Comedy	6.825589	13207.224060

	Content_Type	RATING	VOTES
0	Movies	6.489209	18840.200046
1	Series	7.415999	10883.643121

IMDb 10,000 Netflix Movies and Shows¶

Table of contents¶

Overview¶

Introduction¶

Required Libraries¶

The Problem Domain¶

Step 1: Shaping Goal¶

Step 2: Checking the data¶

Step 3: Tidying the data¶

1. Handling Duplicates¶

2. String Formatting Error¶

3. Inconsistent Year Formats¶

4. ONE-LINE & VOTES Column¶

5. Unstructured STARS Column¶

Extracting Director Column¶

Extracting Stars as Column¶

Creating Star Count Feature¶

6. Restructuring GENRE Column¶

7. Missing Values¶

Reasons for Missingness¶

8. Outliers¶

Visualization¶

Dataset¶

Average Runtime: Movies vs Series¶

Average Rating: Movies vs Series¶

Ongoing vs Completed Series¶

Distribution of Audience Rating¶

Popularity vs Rating¶

Runtime vs Rating¶

Insights¶

Movies vs Series¶

Genres¶

By Popularity¶

By Average Rating¶

Do Longer Series Perform Better?¶

Conclusions¶

4. `ONE-LINE` & `VOTES` Column¶

6. Restructuring `GENRE` Column¶