From 9030b2567bf04618ba2363fd593ad731fbf6a692 Mon Sep 17 00:00:00 2001 From: gitlost Date: Tue, 8 Jul 2025 22:31:16 +0100 Subject: [PATCH] GeniusLyrics: update to parse latest HTML of returned lyrics, devolving the removal of various crud to `HtmlLyricsProvider`; log initial query and use new `StartsOrEndsMatch()` static to match JSON replies, log each request, and break if full match; `StartsOrEndsMatch()` ignores some common punctuation variations & normalizes single quotes and allows match at beginning or end HtmlLyricsProvider: fix `multiple` mode not to terminate on first batch, and defer processing till have whole HTML (avoids issues with tags spanning batches); add param to take list of regular expressions to remove from HTML prior to general processing (used only by `GeniusLyrics` for now) README.md etc: update list of lyrics providers supported --- README.md | 2 +- debian/control | 2 +- ...rawberrymusicplayer.strawberry.appdata.xml | 2 +- dist/unix/strawberry.1 | 2 +- dist/unix/strawberry.spec.in | 2 +- src/lyrics/geniuslyricsprovider.cpp | 44 +++++++++++++++---- src/lyrics/geniuslyricsprovider.h | 3 ++ src/lyrics/htmllyricsprovider.cpp | 41 ++++++++--------- src/lyrics/htmllyricsprovider.h | 2 +- 9 files changed, 65 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index 73d02c22b..9866a8bd7 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ Funding developers is a way to contribute to open source projects you appreciate * Edit tags on audio files * Fetch tags from MusicBrainz * Album cover art from [Last.fm](https://www.last.fm/), [Musicbrainz](https://musicbrainz.org/), [Discogs](https://www.discogs.com/), [Musixmatch](https://www.musixmatch.com/), [Deezer](https://www.deezer.com/), [Tidal](https://www.tidal.com/), [Qobuz](https://www.qobuz.com/) and [Spotify](https://www.spotify.com/) - * Song lyrics from [Genius](https://genius.com/), [Musixmatch](https://www.musixmatch.com/), [ChartLyrics](http://www.chartlyrics.com/), 
[lyrics.ovh](https://lyrics.ovh/), [lololyrics.com](https://www.lololyrics.com/), [songlyrics.com](https://www.songlyrics.com/), [azlyrics.com](https://www.azlyrics.com/) and [elyrics.net](https://www.elyrics.net/) + * Song lyrics from [Genius](https://genius.com/), [Musixmatch](https://www.musixmatch.com/), [ChartLyrics](http://www.chartlyrics.com/), [lyrics.ovh](https://lyrics.ovh/), [lololyrics.com](https://www.lololyrics.com/), [songlyrics.com](https://www.songlyrics.com/), [azlyrics.com](https://www.azlyrics.com/), [elyrics.net](https://www.elyrics.net/), [letras.mus.br](https://www.letras.mus.br) and [LyricFind](https://lyrics.lyricfind.com) * Support for multiple backends * Audio analyzer * Audio equalizer diff --git a/debian/control b/debian/control index c127b74f6..b32a7089c 100644 --- a/debian/control +++ b/debian/control @@ -60,7 +60,7 @@ Description: music player and music collection organizer - Edit tags on audio files - Automatically retrieve tags from MusicBrainz - Album cover art from Last.fm, Musicbrainz, Discogs, Musixmatch, Deezer, Tidal, Qobuz and Spotify - - Song lyrics from Genius, Musixmatch, ChartLyrics, lyrics.ovh, lololyrics.com, songlyrics.com, azlyrics.com and elyrics.net + - Song lyrics from Genius, Musixmatch, ChartLyrics, lyrics.ovh, lololyrics.com, songlyrics.com, azlyrics.com, elyrics.net, letras.mus.br and LyricFind - Audio analyzer - Audio equalizer - Transfer music to mass-storage USB players, MTP compatible devices and iPod Nano/Classic diff --git a/dist/unix/org.strawberrymusicplayer.strawberry.appdata.xml b/dist/unix/org.strawberrymusicplayer.strawberry.appdata.xml index 06d4ec5ad..28acd2171 100644 --- a/dist/unix/org.strawberrymusicplayer.strawberry.appdata.xml +++ b/dist/unix/org.strawberrymusicplayer.strawberry.appdata.xml @@ -31,7 +31,7 @@
  • Edit tags on audio files
  • Automatically retrieve tags from MusicBrainz
  • Album cover art from Last.fm, Musicbrainz, Discogs, Musixmatch, Deezer, Tidal, Qobuz and Spotify
  • -
  • Song lyrics from Genius, Musixmatch, ChartLyrics, lyrics.ovh, lololyrics.com, songlyrics.com, azlyrics.com and elyrics.net
  • +
  • Song lyrics from Genius, Musixmatch, ChartLyrics, lyrics.ovh, lololyrics.com, songlyrics.com, azlyrics.com, elyrics.net, letras.mus.br and LyricFind
  • Audio analyzer and equalizer
  • Transfer music to mass-storage USB players, MTP compatible devices and iPod Nano/Classic
  • Scrobbler with support for Last.fm, Libre.fm and ListenBrainz
  • diff --git a/dist/unix/strawberry.1 b/dist/unix/strawberry.1 index c7cc71a50..14b16b8a0 100644 --- a/dist/unix/strawberry.1 +++ b/dist/unix/strawberry.1 @@ -29,7 +29,7 @@ Features: .br - Album cover art from Last.fm, Musicbrainz, Discogs, Musixmatch, Deezer, Tidal, Qobuz and Spotify .br -- Song lyrics from Lyrics.com, Genius, Musixmatch, ChartLyrics, lyrics.ovh and lololyrics.com +- Song lyrics from Genius, Musixmatch, ChartLyrics, lyrics.ovh, lololyrics.com, songlyrics.com, azlyrics.com, elyrics.net, letras.mus.br and LyricFind .br - Support for multiple backends .br diff --git a/dist/unix/strawberry.spec.in b/dist/unix/strawberry.spec.in index 2e66cc0a8..d315e7dc9 100644 --- a/dist/unix/strawberry.spec.in +++ b/dist/unix/strawberry.spec.in @@ -93,7 +93,7 @@ Features: - Edit tags on audio files - Automatically retrieve tags from MusicBrainz - Album cover art from Last.fm, Musicbrainz, Discogs, Musixmatch, Deezer, Tidal, Qobuz and Spotify - - Song lyrics from Genius, Musixmatch, ChartLyrics, lyrics.ovh, lololyrics.com, songlyrics.com, azlyrics.com and elyrics.net + - Song lyrics from Genius, Musixmatch, ChartLyrics, lyrics.ovh, lololyrics.com, songlyrics.com, azlyrics.com, elyrics.net, letras.mus.br and LyricFind - Support for multiple backends - Audio analyzer - Audio equalizer diff --git a/src/lyrics/geniuslyricsprovider.cpp b/src/lyrics/geniuslyricsprovider.cpp index 8c25e8b4b..66920aa4e 100644 --- a/src/lyrics/geniuslyricsprovider.cpp +++ b/src/lyrics/geniuslyricsprovider.cpp @@ -148,6 +148,8 @@ void GeniusLyricsProvider::StartSearch(const int id, const LyricsSearchRequest & QNetworkReply *reply = CreateGetRequest(QUrl(QLatin1String(kUrlSearch)), url_query); QObject::connect(reply, &QNetworkReply::finished, this, [this, reply, id]() { HandleSearchReply(reply, id); }); + qLog(Debug) << name_ << "Sending request for" << url_query.query(); + } GeniusLyricsProvider::JsonObjectResult GeniusLyricsProvider::ParseJsonObject(QNetworkReply *reply) { @@ -302,10 +304,8 
@@ void GeniusLyricsProvider::HandleSearchReply(QNetworkReply *reply, const int id) const QString artist = primary_artist["name"_L1].toString(); const QString title = object_result["title"_L1].toString(); - // Ignore results where both the artist and title don't match. - if (!artist.startsWith(search->request.albumartist, Qt::CaseInsensitive) && - !artist.startsWith(search->request.artist, Qt::CaseInsensitive) && - !title.startsWith(search->request.title, Qt::CaseInsensitive)) { + // Ignore results where the artist or title don't begin or end the same + if (!StartsOrEndsMatch(artist, search->request.artist) || !StartsOrEndsMatch(title, search->request.title)) { continue; } @@ -323,6 +323,12 @@ void GeniusLyricsProvider::HandleSearchReply(QNetworkReply *reply, const int id) QNetworkReply *new_reply = CreateGetRequest(url); QObject::connect(new_reply, &QNetworkReply::finished, this, [this, new_reply, search, url]() { HandleLyricReply(new_reply, search->id, url); }); + qLog(Debug) << name_ << "Sending request for" << url; + + // If full match, don't bother iterating further + if (artist == search->request.albumartist && artist == search->request.artist && title == search->request.title) { + break; + } } } @@ -363,12 +369,18 @@ void GeniusLyricsProvider::HandleLyricReply(QNetworkReply *reply, const int sear return; } - const QString content = QString::fromUtf8(data); - QString lyrics = HtmlLyricsProvider::ParseLyricsFromHTML(content, QRegularExpression(u"]*>"_s), QRegularExpression(u"<\\/div>"_s), QRegularExpression(u"
    ]+>"_s), true); - if (lyrics.isEmpty()) { - lyrics = HtmlLyricsProvider::ParseLyricsFromHTML(content, QRegularExpression(u"]*>"_s), QRegularExpression(u"<\\/div>"_s), QRegularExpression(u"
    "_s), true); - } + static const QRegularExpression start_tag(u"]*>"_s); + static const QRegularExpression end_tag(u"<\\/div>"_s); + static const QRegularExpression lyrics_start(u"
    ]+>"_s); + static const QRegularExpression regex_html_tag_span_trans(u"]*>[^<]*"_s); + static const QRegularExpression regex_html_tag_div_ellipsis(u"
    ]*>[^<]*
    "_s); + static const QRegularExpression regex_html_tag_span_contribs(u"]*>[^<]*"_s); + static const QRegularExpression regex_html_tag_div_bio(u"
    ]*>.*?
    "_s); + static const QRegularExpression regex_html_tag_h2(u"

    ]*>[^<]*

    "_s); + static const QList regex_removes{ regex_html_tag_span_trans, regex_html_tag_div_ellipsis, regex_html_tag_span_contribs, regex_html_tag_div_bio, regex_html_tag_h2 }; + + const QString lyrics = HtmlLyricsProvider::ParseLyricsFromHTML(QString::fromUtf8(data), start_tag, end_tag, lyrics_start, true, regex_removes); if (!lyrics.isEmpty()) { LyricsSearchResult result(lyrics); result.artist = lyric.artist; @@ -404,3 +416,17 @@ void GeniusLyricsProvider::EndSearch(const int id, const LyricsSearchRequest &re Q_EMIT SearchFinished(id, results); } + +bool GeniusLyricsProvider::StartsOrEndsMatch(QString s, QString t) { + + constexpr Qt::CaseSensitivity cs = Qt::CaseInsensitive; + + static const QRegularExpression puncts_regex(u"[!,.:;]"_s); + static const QRegularExpression quotes_regex(u"[’‘´`]"_s); + + s.remove(puncts_regex).replace(quotes_regex, u"'"_s); + t.remove(puncts_regex).replace(quotes_regex, u"'"_s); + + return (s.compare(t, cs) == 0 && !s.isEmpty()) || (!s.isEmpty() && !t.isEmpty() && (s.startsWith(t, cs) || t.startsWith(s, cs) || s.endsWith(t, cs) || t.endsWith(s, cs))); + +} diff --git a/src/lyrics/geniuslyricsprovider.h b/src/lyrics/geniuslyricsprovider.h index 949067f03..47771c519 100644 --- a/src/lyrics/geniuslyricsprovider.h +++ b/src/lyrics/geniuslyricsprovider.h @@ -79,6 +79,9 @@ class GeniusLyricsProvider : public JsonLyricsProvider { void HandleSearchReply(QNetworkReply *reply, const int id); void HandleLyricReply(QNetworkReply *reply, const int search_id, const QUrl &url); + private: + static bool StartsOrEndsMatch(QString s, QString t); + private: OAuthenticator *oauth_; mutable QMutex mutex_access_token_; diff --git a/src/lyrics/htmllyricsprovider.cpp b/src/lyrics/htmllyricsprovider.cpp index 4402415e2..1b2cbe165 100644 --- a/src/lyrics/htmllyricsprovider.cpp +++ b/src/lyrics/htmllyricsprovider.cpp @@ -109,7 +109,7 @@ void HtmlLyricsProvider::HandleLyricsReply(QNetworkReply *reply, const int id, c } -QString 
HtmlLyricsProvider::ParseLyricsFromHTML(const QString &content, const QRegularExpression &start_tag, const QRegularExpression &end_tag, const QRegularExpression &lyrics_start, const bool multiple) { +QString HtmlLyricsProvider::ParseLyricsFromHTML(const QString &content, const QRegularExpression &start_tag, const QRegularExpression &end_tag, const QRegularExpression &lyrics_start, const bool multiple, const QList ®ex_removes) { Q_ASSERT(QThread::currentThread() != qApp->thread()); @@ -153,30 +153,31 @@ QString HtmlLyricsProvider::ParseLyricsFromHTML(const QString &content, const QR if (!lyrics.isEmpty()) { lyrics.append(u'\n'); } - static const QRegularExpression regex_html_tag_a(u"]*>[^<]*"_s); - static const QRegularExpression regex_html_tag_script(u""_s); - static const QRegularExpression regex_html_tag_div(u"
    ]*>×
    "_s); - static const QRegularExpression regex_html_tag_br(u"]*>"_s); - static const QRegularExpression regex_html_tag_p_close(u"

    "_s); - static const QRegularExpression regex_html_tags(u"<[^>]*>"_s); - lyrics.append(content.mid(start_lyrics_idx, end_lyrics_idx - start_lyrics_idx) - .remove(u'\r') - .remove(u'\n') - .remove(regex_html_tag_a) - .remove(regex_html_tag_script) - .remove(regex_html_tag_div) - .replace(regex_html_tag_br, u"\n"_s) - .replace(regex_html_tag_p_close, u"\n\n"_s) - .remove(regex_html_tags) - .trimmed()); - } - else { - start_idx = -1; + lyrics.append(content.mid(start_lyrics_idx, end_lyrics_idx - start_lyrics_idx).remove(u'\r').remove(u'\n')); } } while (start_idx > 0 && multiple); + for (auto it = regex_removes.cbegin(); it != regex_removes.cend(); it++) { + lyrics.remove(*it); + } + static const QRegularExpression regex_html_tag_a(u"]*>[^<]*"_s); + static const QRegularExpression regex_html_tag_script(u""_s); + static const QRegularExpression regex_html_tag_div(u"
    ]*>×
    "_s); + static const QRegularExpression regex_html_tag_br(u"]*>"_s); + static const QRegularExpression regex_html_tag_p_close(u"

    "_s); + static const QRegularExpression regex_html_tags(u"<[^>]*>"_s); + static const QRegularExpression regex_newlines_squash(u"\\n{3,}"_s); + lyrics.remove(regex_html_tag_a) + .remove(regex_html_tag_script) + .remove(regex_html_tag_div) + .replace(regex_html_tag_br, u"\n"_s) + .replace(regex_html_tag_p_close, u"\n\n"_s) + .remove(regex_html_tags) + .replace(regex_newlines_squash, u"\n\n"_s); + lyrics = lyrics.trimmed(); + if (lyrics.length() > 6000 || lyrics.contains("there are no lyrics to"_L1, Qt::CaseInsensitive)) { return QString(); } diff --git a/src/lyrics/htmllyricsprovider.h b/src/lyrics/htmllyricsprovider.h index 4022850dd..b8e31148f 100644 --- a/src/lyrics/htmllyricsprovider.h +++ b/src/lyrics/htmllyricsprovider.h @@ -41,7 +41,7 @@ class HtmlLyricsProvider : public LyricsProvider { virtual bool StartSearchAsync(const int id, const LyricsSearchRequest &request) override; - static QString ParseLyricsFromHTML(const QString &content, const QRegularExpression &start_tag, const QRegularExpression &end_tag, const QRegularExpression &lyrics_start, const bool multiple); + static QString ParseLyricsFromHTML(const QString &content, const QRegularExpression &start_tag, const QRegularExpression &end_tag, const QRegularExpression &lyrics_start, const bool multiple, const QList ®ex_removes = {}); protected: virtual QUrl Url(const LyricsSearchRequest &request) = 0;