{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import urllib.parse\n", "from random import shuffle\n", "from my_rymscraper.rymscraper import rymscraper, RymUrl\n", "network = rymscraper.RymNetwork()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameArtistTypeReleasedRecordedRYM RatingRankedGenresDescriptorsLanguageTrack listingColorscheme
0Room 25NonameAlbum14 September 2018July-September 20183.59\\n \\n/ 5.00.5\\n\\n ...#58 for 2018, #8,307 overallJazz Rap, Conscious Hip Hop\\nNeo-Soul, Jazz Po...female vocals, introspective, conscious, po...English[Self, Blaxploitation, Prayer Song, Window, Do...[#cccccc, #afaeaa, #c9c9c8, #9c9c9a, #b7b7b7, ...
\n", "
" ], "text/plain": [ " Name Artist Type Released Recorded \\\n", "0 Room 25 Noname Album 14 September 2018 July-September 2018 \n", "\n", " RYM Rating \\\n", "0 3.59\\n \\n/ 5.00.5\\n\\n ... \n", "\n", " Ranked \\\n", "0 #58 for 2018, #8,307 overall \n", "\n", " Genres \\\n", "0 Jazz Rap, Conscious Hip Hop\\nNeo-Soul, Jazz Po... \n", "\n", " Descriptors Language \\\n", "0 female vocals, introspective, conscious, po... English \n", "\n", " Track listing \\\n", "0 [Self, Blaxploitation, Prayer Song, Window, Do... \n", "\n", " Colorscheme \n", "0 [#cccccc, #afaeaa, #c9c9c8, #9c9c9a, #b7b7b7, ... " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Sample raw data: the record returned for a single album page\n",
 "pd.DataFrame([network.get_album_infos(url=\"https://rateyourmusic.com/release/album/noname/room-25/\")])"
 ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [
 "mt = pd.read_csv(\"meta.csv\")"
 ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [
 "# Work list for the scraper: one (id, singer, album) triple per row of meta.csv\n",
 "tags = list(zip(mt[\"id\"], mt[\"Singer\"], mt[\"Album_Name\"]))\n",
 "# Columns kept for the analysis; df starts empty and is filled by the scrape loop\n",
 "cols = ['id', 'Name', 'Artist', 'Released', 'RYM Rating', 'Genres', 'Descriptors',\n",
 "        'Language', 'Track listing', 'Colorscheme', 'Languages', 'Ranked']\n",
 "df = pd.DataFrame(columns=cols)"
 ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [
 "def lower_alnum_dash(s):\n",
 "    \"\"\"Turn `s` into a URL slug: spaces become dashes, letters are lower-cased,\n",
 "    and every character that is neither alphanumeric nor a dash is dropped.\"\"\"\n",
 "    return \"\".join(x for x in s.replace(\" \", \"-\").lower() if x.isalnum() or x == \"-\")\n",
 "\n",
 "\n",
 "max_iter = 1 # one iteration is enough for most cases, raise it if errors occur\n",
 "records = []  # one dict per scraped album; appended to df in a single concat\n",
 "\n",
 "try:\n",
 "    for _ in range(max_iter):\n",
 "        if not tags:\n",
 "            break  # nothing left to (re)try\n",
 "        new_tags = []\n",
 "        for uid, singer, album in tags:\n",
 "            # Remove dashes first\n",
 "            singer = singer.replace(\"-\", \" \")\n",
 "            album = album.replace(\"-\", \" \")\n",
 "            # Built before any scraping so the search fallback and the log lines can always use it\n",
 "            tag = singer + \" - \" + album\n",
 "\n",
 "            try:\n",
 "                # Try url first\n",
 "                url = \"https://rateyourmusic.com/release/album/%s/%s/\" % (lower_alnum_dash(singer), lower_alnum_dash(album))\n",
 "                album_infos = network.get_album_infos(url=url)\n",
 "            except Exception:\n",
 "                # Best effort: any failure here just sends us to the search fallback.\n",
 "                # Catching Exception instead of a bare except keeps kernel interrupts working.\n",
 "                album_infos = None\n",
 "\n",
 "            if not album_infos:\n",
 "                try:\n",
 "                    # Try search then\n",
 "                    url_tag = urllib.parse.quote(tag)\n",
 "                    album_infos = network.get_album_infos(name=url_tag) # get album by search\n",
 "                except IndexError:\n",
 "                    album_infos = None\n",
 "\n",
 "            if not album_infos:\n",
 "                # Reported for every miss, whether the search raised or simply returned nothing\n",
 "                print(\"❌ \", tag)\n",
 "                new_tags.append((uid, singer, album))\n",
 "            else:\n",
 "                print(\"✅ \", tag)\n",
 "                album_infos[\"id\"] = uid\n",
 "                records.append(album_infos)\n",
 "\n",
 "        # Only the albums that failed are retried on the next pass\n",
 "        tags = new_tags\n",
 "finally:\n",
 "    # Runs even if the scrape is interrupted, so rows collected so far still reach df.\n",
 "    # ignore_index gives the rows unique labels instead of a repeated 0.\n",
 "    if records:\n",
 "        df = pd.concat([df, pd.DataFrame(records)], axis=0, ignore_index=True)"
 ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('stevereich2018pultet', 'Steve Reich', 'Pulse/Quartet')]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tags" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IdNameArtistReleasedRYM RatingGenresDescriptorsLanguageTrack listingColorschemeLanguagesRanked
count222222212202
unique222222212202
topchristinavantzou2018no4no4Nº4Christina Vantzou6 April 20183.32\\n \\n/ 5.00.5\\n\\n ...Ambient, Drone\\nModern Classical, Dark Ambientdark, minimalistic, atmospheric, sombreEnglish[Glissando for Bodies and Machines in Space, P...[#7a7a7a, #cccccc]NaN#674 for 2018
freq1111111111NaN1
\n", "
" ], "text/plain": [ " Id Name Artist Released \\\n", "count 2 2 2 2 \n", "unique 2 2 2 2 \n", "top christinavantzou2018no4no4 Nº4 Christina Vantzou 6 April 2018 \n", "freq 1 1 1 1 \n", "\n", " RYM Rating \\\n", "count 2 \n", "unique 2 \n", "top 3.32\\n \\n/ 5.00.5\\n\\n ... \n", "freq 1 \n", "\n", " Genres \\\n", "count 2 \n", "unique 2 \n", "top Ambient, Drone\\nModern Classical, Dark Ambient \n", "freq 1 \n", "\n", " Descriptors Language \\\n", "count 2 1 \n", "unique 2 1 \n", "top dark, minimalistic, atmospheric, sombre English \n", "freq 1 1 \n", "\n", " Track listing Colorscheme \\\n", "count 2 2 \n", "unique 2 2 \n", "top [Glissando for Bodies and Machines in Space, P... [#7a7a7a, #cccccc] \n", "freq 1 1 \n", "\n", " Languages Ranked \n", "count 0 2 \n", "unique 0 2 \n", "top NaN #674 for 2018 \n", "freq NaN 1 " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Keep only the columns of interest. reset_index(drop=True) returns a new frame rather\n",
 "# than a slice of df, so later column assignments do not raise SettingWithCopyWarning,\n",
 "# and it relabels the rows 0..n-1 (the old bare `xdf.reset_index()` discarded its result).\n",
 "xdf = df[cols].reset_index(drop=True)\n",
 "xdf.describe()"
 ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [
 "# Use \"Languages\" wherever \"Language\" is missing; rows that already have a value are untouched\n",
 "xdf[\"Language\"] = xdf[\"Language\"].fillna(xdf[\"Languages\"])"
 ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IdNameArtistReleasedRYM RatingGenresDescriptorsLanguageTrack listingColorschemeRanked
count22222221222
unique22222221222
topchristinavantzou2018no4no4Nº4Christina Vantzou6 April 20183.32\\n \\n/ 5.00.5\\n\\n ...Ambient, Drone\\nModern Classical, Dark Ambientdark, minimalistic, atmospheric, sombreEnglish[Glissando for Bodies and Machines in Space, P...[#7a7a7a, #cccccc]#674 for 2018
freq11111111111
\n", "
" ], "text/plain": [ " Id Name Artist Released \\\n", "count 2 2 2 2 \n", "unique 2 2 2 2 \n", "top christinavantzou2018no4no4 Nº4 Christina Vantzou 6 April 2018 \n", "freq 1 1 1 1 \n", "\n", " RYM Rating \\\n", "count 2 \n", "unique 2 \n", "top 3.32\\n \\n/ 5.00.5\\n\\n ... \n", "freq 1 \n", "\n", " Genres \\\n", "count 2 \n", "unique 2 \n", "top Ambient, Drone\\nModern Classical, Dark Ambient \n", "freq 1 \n", "\n", " Descriptors Language \\\n", "count 2 1 \n", "unique 2 1 \n", "top dark, minimalistic, atmospheric, sombre English \n", "freq 1 1 \n", "\n", " Track listing Colorscheme \\\n", "count 2 2 \n", "unique 2 2 \n", "top [Glissando for Bodies and Machines in Space, P... [#7a7a7a, #cccccc] \n", "freq 1 1 \n", "\n", " Ranked \n", "count 2 \n", "unique 2 \n", "top #674 for 2018 \n", "freq 1 " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Drop the \"Languages\" column. .copy() makes xdf an independent frame, so the column\n",
 "# assignments further down do not trigger SettingWithCopyWarning.\n",
 "xdf = xdf[['id', 'Name', 'Artist', 'Released', 'RYM Rating', 'Genres', 'Descriptors',\n",
 "           'Language', 'Track listing', 'Colorscheme','Ranked']].copy()\n",
 "xdf.describe()"
 ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [
 "import re\n",
 "\n",
 "\n",
 "def parse_rym_rating(raw):\n",
 "    \"\"\"Split one scraped \"RYM Rating\" value into (average rating, number of ratings).\n",
 "\n",
 "    The first line of the text is parsed as the average. The first number found on the\n",
 "    last line is parsed as the count, with thousands separators removed. Anything that\n",
 "    is not a string (e.g. NaN for a missing rating) yields (\"\", \"\").\n",
 "    \"\"\"\n",
 "    if not isinstance(raw, str):\n",
 "        return \"\", \"\"\n",
 "    lines = raw.split(\"\\n\")\n",
 "    # `\\d[\\d,]*` keeps every comma-separated group; `\\d*,*\\d+` stopped after the first comma group\n",
 "    count_text = re.findall(r\"\\d[\\d,]*\", lines[-1])[0]\n",
 "    return float(lines[0]), int(count_text.replace(\",\", \"\"))\n",
 "\n",
 "\n",
 "# NOTE: run once per scrape. After the next cell overwrites \"RYM Rating\" with numbers,\n",
 "# re-running this cell would see no strings and blank out both lists.\n",
 "parsed = [parse_rym_rating(raw) for raw in xdf[\"RYM Rating\"]]\n",
 "ratings = [rating for rating, _ in parsed]\n",
 "num_ratings = [count for _, count in parsed]"
 ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [
 "xdf[\"RYM Rating\"] = ratings\n",
 "xdf[\"Num_of_RYM_Ratings\"] = num_ratings"
 ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IdNameArtistReleasedRYM RatingGenresDescriptorsLanguageTrack listingColorschemeRankedNum_of_RYM_Ratings
0christinavantzou2018no4no4Nº4Christina Vantzou6 April 20183.32Ambient, Drone\\nModern Classical, Dark Ambientdark, minimalistic, atmospheric, sombreNaN[Glissando for Bodies and Machines in Space, P...[#7a7a7a, #cccccc]#674 for 2018378
0billcallahan2022ytiaerYTI⅃AƎЯBill Callahan14 October 20223.57Americana, Singer-Songwriter\\nAlt-Country, Fol...male vocals, introspective, pastoral, poeti...English[First Bird, Everyway, Bowevil, Partition, Lil...[#a0a5a1, #324856, #4f705f, #734144]#104 for 2022895
\n", "
" ], "text/plain": [ " Id Name Artist Released \\\n", "0 christinavantzou2018no4no4 Nº4 Christina Vantzou 6 April 2018 \n", "0 billcallahan2022ytiaer YTI⅃AƎЯ Bill Callahan 14 October 2022 \n", "\n", " RYM Rating Genres \\\n", "0 3.32 Ambient, Drone\\nModern Classical, Dark Ambient \n", "0 3.57 Americana, Singer-Songwriter\\nAlt-Country, Fol... \n", "\n", " Descriptors Language \\\n", "0 dark, minimalistic, atmospheric, sombre NaN \n", "0 male vocals, introspective, pastoral, poeti... English \n", "\n", " Track listing \\\n", "0 [Glissando for Bodies and Machines in Space, P... \n", "0 [First Bird, Everyway, Bowevil, Partition, Lil... \n", "\n", " Colorscheme Ranked Num_of_RYM_Ratings \n", "0 [#7a7a7a, #cccccc] #674 for 2018 378 \n", "0 [#a0a5a1, #324856, #4f705f, #734144] #104 for 2022 895 " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Final table, with the rating split into a numeric average and a ratings count\n",
 "xdf"
 ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [
 "# Write the table to rym_add.csv (path is relative to the working directory)\n",
 "xdf.to_csv(path_or_buf=\"rym_add.csv\")"
 ] } ], "metadata": { "kernelspec": { "display_name": "edav", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "0df9b00d9d230cd9a3bd41555cc6e6dfffa51e74efe856b6a8c4a87056f0c072" } } }, "nbformat": 4, "nbformat_minor": 2 }