From 8615a75c5971d25d564c5e46a73a01087eaa32f4 Mon Sep 17 00:00:00 2001
From: amritsingh047 <68320490+amritsingh047@users.noreply.github.com>
Date: Fri, 31 Oct 2025 19:22:52 +0530
Subject: [PATCH] Add YouTube music data analysis script

This script analyzes YouTube music hits data, calculating engagement rates and visualizing trends by genre and artist. It includes data loading, cleaning, and exploratory data analysis.
---
 Youtube music | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 98 insertions(+)
 create mode 100644 Youtube music

diff --git a/Youtube music b/Youtube music
new file mode 100644
index 000000000..26610b3dc
--- /dev/null
+++ b/Youtube music	
@@ -0,0 +1,98 @@
+# Which ten songs have the highest calculated engagement rate (Likes to Views ratio), indicating the highest viewer appreciation?
+# 1. Import necessary libraries
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+# Use seaborn's 'dark' style; 12x7 is the default figure size (a figure that passes its own figsize overrides it)
+sns.set_style('dark')
+plt.rcParams['figure.figsize'] = (12, 7)
+
+# --- 2. Data Loading ---
+# NOTE: Replace 'youtube_hits_2025.csv' with your actual file path. If the file is
+# missing, a five-row demonstration DataFrame is built so the rest of the script runs.
+file_path = 'youtube_hits_2025.csv'
+try:
+    df = pd.read_csv(file_path)
+except FileNotFoundError:
+    print(f"Error: File '{file_path}' not found. Creating a dummy DataFrame for demonstration.")
+    df = pd.DataFrame({
+        'Title': ['Song A', 'Song B', 'Song C', 'Song D', 'Song E'],
+        'Artist': ['Artist 1', 'Artist 2', 'Artist 1', 'Artist 3', 'Artist 4'],
+        'Genre': ['Pop', 'Rock', 'Pop', 'Hip Hop', 'Pop'],
+        'Views': [1500000000, 500000000, 1800000000, 900000000, 1200000000],
+        'Likes': [15000000, 5000000, 18000000, 9000000, 12000000],
+        'Comments': [500000, 100000, 600000, 300000, 400000],
+        'Release_Date': ['2024-11-20', '2025-01-15', '2024-12-01', '2025-02-10', '2025-03-05']
+    })
+else:
+    print("YouTube Music Hits data successfully loaded!")
+
+# Initial Data Exploration: first rows, then the column/dtype/non-null summary
+print("\n--- Initial Data Info ---")
+print(df.head())
+df.info()  # info() prints its own report and returns None; wrapping it in print() added a stray "None"
+
+
+# --- 3. Data Cleaning and Feature Engineering ---
+
+# 3.1. Parse 'Release_Date' into datetimes and derive the release month name
+df['Release_Date'] = pd.to_datetime(df['Release_Date'])
+df['Release_Month'] = df['Release_Date'].dt.month_name()
+
+# 3.2. Engagement rate = likes per 100 views. Rows with zero views become NaN instead
+# of inf, so they sort last (NaN position default) rather than topping the ranking.
+df['Engagement_Rate'] = (df['Likes'] / df['Views'].replace(0, np.nan)) * 100
+df.sort_values(by='Engagement_Rate', ascending=False, inplace=True)
+
+
+# --- 4. Exploratory Data Analysis (EDA) & Insights ---
+
+# 4.1. Ten rows with the largest 'Views', highest first
+top_views = df.sort_values(by='Views', ascending=False).iloc[:10]
+print("\n--- 4.1 Top 10 Most Viewed Songs ---")
+print(top_views.loc[:, ['Title', 'Artist', 'Views']])
+
+# 4.2. Ten rows with the largest 'Engagement_Rate', highest first
+top_engagement = df.sort_values(by='Engagement_Rate', ascending=False).iloc[:10]
+print("\n--- 4.2 Top 10 Songs by Engagement Rate (%) ---")
+print(top_engagement.loc[:, ['Title', 'Artist', 'Engagement_Rate']])
+
+
+# 4.3. Genre Popularity and Performance (Total Views)
+genre_views = df.groupby('Genre')['Views'].sum().sort_values(ascending=False)
+print("\n--- 4.3 Total Views by Genre ---")
+print(genre_views)
+# Bars are scaled to billions so they agree with the y-axis label; the printed table above stays in raw views.
+plt.figure(figsize=(10, 6))
+sns.barplot(x=genre_views.index, y=genre_views.values / 1e9, palette='plasma')
+plt.title('Total Views by Music Genre ')
+plt.ylabel('Total Views (in Billions)')
+plt.xlabel('Genre')
+plt.xticks(rotation=45, ha='right')
+plt.tight_layout()
+plt.show()
+
+
+# 4.4. Artist Dominance: the ten artists with the most rows in the data
+artist_hits = df['Artist'].value_counts().iloc[:10]
+print("\n--- 4.4 Top 10 Artists by Number of Hits ---")
+print(artist_hits)
+
+plt.figure(figsize=(10, 6))
+sns.barplot(x=artist_hits.to_numpy(), y=artist_hits.index, palette='viridis')
+plt.xlabel('Number of Hits')
+plt.ylabel('Artist')
+plt.title('Top 10 Artists by Count of Songs in the Top 100')
+plt.show()
+
+# 4.5. Monthly Release Trend: counts in calendar order (January..December), 0 for months with no releases
+month_order = pd.date_range('2000-01-01', periods=12, freq='MS').month_name()
+monthly_releases = df['Release_Month'].value_counts().reindex(month_order, fill_value=0)
+plt.figure(figsize=(10, 6))
+sns.lineplot(x=monthly_releases.index, y=monthly_releases.values, marker='o', linewidth=3, color='red', sort=False)
+plt.title('Distribution of Top 100 Releases by Month')
+plt.ylabel('Number of Songs Released')
+plt.xlabel('Month')
+plt.show()