"""Exploratory analysis of a YouTube music-hits dataset.

Driving question: which ten songs have the highest calculated engagement
rate (likes-to-views ratio), indicating the highest viewer appreciation?

The script loads a CSV of hits (falling back to a small dummy dataset when
the file is missing), derives a release month and an engagement rate for
every song, prints several top-10 tables and shows three charts.

Expected columns: Title, Artist, Genre, Views, Likes, Release_Date.
"""

# --- 1. Imports ---
# matplotlib / seaborn are imported inside `_configure_plotting` so that the
# data helpers below can be imported and tested without a plotting stack.
import numpy as np
import pandas as pd

# NOTE: Replace 'youtube_hits_2025.csv' with your actual file path.
DEFAULT_CSV_PATH = 'youtube_hits_2025.csv'
TOP_N = 10

# English month names in calendar order; matches `Series.dt.month_name()`
# with its default locale and is used to order the monthly trend.
MONTH_ORDER = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]


# --- 2. Data Loading ---
def load_data(file_path=DEFAULT_CSV_PATH):
    """Read the hits CSV at *file_path*.

    If the file does not exist, a message is printed and a five-row dummy
    DataFrame with the same columns is returned so the rest of the script
    can still be demonstrated.
    """
    try:
        # Only the read can raise FileNotFoundError, so only it is guarded.
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found. Creating a dummy DataFrame for demonstration.")
        # Minimal dummy DataFrame for structural demonstration
        data = {
            'Title': ['Song A', 'Song B', 'Song C', 'Song D', 'Song E'],
            'Artist': ['Artist 1', 'Artist 2', 'Artist 1', 'Artist 3', 'Artist 4'],
            'Genre': ['Pop', 'Rock', 'Pop', 'Hip Hop', 'Pop'],
            'Views': [1500000000, 500000000, 1800000000, 900000000, 1200000000],
            'Likes': [15000000, 5000000, 18000000, 9000000, 12000000],
            'Comments': [500000, 100000, 600000, 300000, 400000],
            'Release_Date': ['2024-11-20', '2025-01-15', '2024-12-01', '2025-02-10', '2025-03-05'],
        }
        return pd.DataFrame(data)
    print("YouTube Music Hits data successfully loaded!")
    return df


# --- 3. Data Cleaning and Feature Engineering ---
def add_features(df):
    """Return a copy of *df* with derived columns, sorted by engagement.

    Adds:
      * ``Release_Date``    - parsed to datetime (unparseable values raise).
      * ``Release_Month``   - English month name of the release date.
      * ``Engagement_Rate`` - ``Likes / Views * 100`` in percent.

    Rows are ordered by ``Engagement_Rate`` descending. The input frame is
    not modified.
    """
    out = df.copy()

    # 3.1. Convert 'Release_Date' to datetime and extract time features
    out['Release_Date'] = pd.to_datetime(out['Release_Date'])
    out['Release_Month'] = out['Release_Date'].dt.month_name()

    # 3.2. Calculate Engagement Rate (Likes to Views Ratio)
    # Rows with no (or negative) views have no meaningful ratio. Masking
    # them to NaN keeps them from turning into `inf` and topping the ranking.
    valid_views = out['Views'].where(out['Views'] > 0, np.nan)
    out['Engagement_Rate'] = (out['Likes'] / valid_views) * 100

    # Stable sort: tied songs keep their original relative order; NaN last.
    return out.sort_values(by='Engagement_Rate', ascending=False, kind='stable')


def top_by(df, column, n=TOP_N):
    """Return the *n* rows of *df* with the largest values in *column*."""
    return df.sort_values(by=column, ascending=False, kind='stable').head(n)


def views_by_genre(df):
    """Return total ``Views`` per ``Genre``, largest first."""
    return df.groupby('Genre')['Views'].sum().sort_values(ascending=False)


def monthly_release_counts(df):
    """Return the number of songs per ``Release_Month`` in calendar order.

    All twelve months are present in the result; months with no releases
    have a count of 0 so a line plot reads as a real January-December trend.
    """
    counts = df['Release_Month'].value_counts()
    return counts.reindex(MONTH_ORDER, fill_value=0)


# --- Plotting helpers ---
def _configure_plotting():
    """Import the plotting libraries, apply the global style, return them."""
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Set visualization style and parameters
    sns.set_style('dark')
    plt.rcParams['figure.figsize'] = (12, 7)
    return plt, sns


def _colored_barplot(sns, *, x, y, categories, palette):
    """Draw a bar plot with one palette color per category.

    `hue` is assigned explicitly because seaborn deprecates `palette`
    without `hue`; any legend that results is redundant with the axis
    labels and is removed.
    """
    ax = sns.barplot(x=x, y=y, hue=categories, palette=palette, dodge=False)
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()
    return ax


def _plot_genre_views(plt, sns, genre_views):
    """Bar chart of total views per genre, scaled to billions."""
    genres = list(genre_views.index)
    plt.figure(figsize=(10, 6))
    # The axis label promises billions, so scale the raw counts to match.
    _colored_barplot(sns, x=genres, y=genre_views.values / 1e9,
                     categories=genres, palette='plasma')
    plt.title('Total Views by Music Genre ')
    plt.ylabel('Total Views (in Billions)')
    plt.xlabel('Genre')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()


def _plot_artist_hits(plt, sns, artist_hits):
    """Horizontal bar chart of the number of hits per artist."""
    artists = list(artist_hits.index)
    plt.figure(figsize=(10, 6))
    _colored_barplot(sns, x=artist_hits.values, y=artists,
                     categories=artists, palette='viridis')
    plt.title('Top 10 Artists by Count of Songs in the Top 100')
    plt.xlabel('Number of Hits')
    plt.ylabel('Artist')
    plt.show()


def _plot_monthly_releases(plt, sns, monthly_releases):
    """Line chart of releases per calendar month."""
    plt.figure(figsize=(10, 6))
    # sort=False: the data is already in calendar order; do not let the
    # plot reorder the month labels.
    sns.lineplot(x=list(monthly_releases.index), y=monthly_releases.values,
                 sort=False, marker='o', linewidth=3, color='red')
    plt.title('Distribution of Top 100 Releases by Month')
    plt.ylabel('Number of Songs Released')
    plt.xlabel('Month')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()


def main(file_path=DEFAULT_CSV_PATH):
    """Run the full analysis: load, enrich, print tables, show charts."""
    plt, sns = _configure_plotting()
    df = load_data(file_path)

    # Initial Data Exploration
    print("\n--- Initial Data Info ---")
    print(df.head())
    # info() prints its own report and returns None, so it is not wrapped
    # in print().
    df.info()

    df = add_features(df)

    # --- 4. Exploratory Data Analysis (EDA) & Insights ---

    # 4.1. Top 10 Most Viewed Songs
    top_views = top_by(df, 'Views')
    print("\n--- 4.1 Top 10 Most Viewed Songs ---")
    print(top_views[['Title', 'Artist', 'Views']])

    # 4.2. Top 10 Songs by Engagement Rate
    top_engagement = top_by(df, 'Engagement_Rate')
    print("\n--- 4.2 Top 10 Songs by Engagement Rate (%) ---")
    print(top_engagement[['Title', 'Artist', 'Engagement_Rate']])

    # 4.3. Genre Popularity and Performance (Total Views)
    genre_views = views_by_genre(df)
    print("\n--- 4.3 Total Views by Genre ---")
    print(genre_views)
    _plot_genre_views(plt, sns, genre_views)

    # 4.4. Artist Dominance (Number of Hits)
    artist_hits = df['Artist'].value_counts().head(TOP_N)
    print("\n--- 4.4 Top 10 Artists by Number of Hits ---")
    print(artist_hits)
    _plot_artist_hits(plt, sns, artist_hits)

    # 4.5. Monthly Release Trend
    _plot_monthly_releases(plt, sns, monthly_release_counts(df))


if __name__ == "__main__":
    main()