GeniusLyrics: update to parse latest HTML of returned lyrics,

devolving the removal of various crud to `HtmlLyricsProvider`;
  log initial query and use new `StartsOrEndsMatch()` static to
  match JSON replies, log each request, and break if full match;
  `StartsOrEndsMatch()` ignores some common punctuation variations
   & normalizes single quotes and allows match at beginning or end
HtmlLyricsProvider: fix `multiple` mode so that it does not terminate on
  the first batch, and defer processing until the whole HTML has been
  collected (avoids issues with tags spanning batches);
  add param to take list of regular expressions to remove from HTML
  prior to general processing (used only by `GeniusLyrics` for now)
README.md etc: update list of lyrics providers supported
This commit is contained in:
gitlost
2025-07-08 22:31:16 +01:00
committed by Jonas Kvinge
parent ee7bb449a5
commit 9030b2567b
9 changed files with 65 additions and 35 deletions

View File

@@ -53,7 +53,7 @@ Funding developers is a way to contribute to open source projects you appreciate
* Edit tags on audio files
* Fetch tags from MusicBrainz
* Album cover art from [Last.fm](https://www.last.fm/), [Musicbrainz](https://musicbrainz.org/), [Discogs](https://www.discogs.com/), [Musixmatch](https://www.musixmatch.com/), [Deezer](https://www.deezer.com/), [Tidal](https://www.tidal.com/), [Qobuz](https://www.qobuz.com/) and [Spotify](https://www.spotify.com/)
* Song lyrics from [Genius](https://genius.com/), [Musixmatch](https://www.musixmatch.com/), [ChartLyrics](http://www.chartlyrics.com/), [lyrics.ovh](https://lyrics.ovh/), [lololyrics.com](https://www.lololyrics.com/), [songlyrics.com](https://www.songlyrics.com/), [azlyrics.com](https://www.azlyrics.com/) and [elyrics.net](https://www.elyrics.net/)
* Song lyrics from [Genius](https://genius.com/), [Musixmatch](https://www.musixmatch.com/), [ChartLyrics](http://www.chartlyrics.com/), [lyrics.ovh](https://lyrics.ovh/), [lololyrics.com](https://www.lololyrics.com/), [songlyrics.com](https://www.songlyrics.com/), [azlyrics.com](https://www.azlyrics.com/), [elyrics.net](https://www.elyrics.net/), [letras.mus.br](https://www.letras.mus.br) and [LyricFind](https://lyrics.lyricfind.com)
* Support for multiple backends
* Audio analyzer
* Audio equalizer

2
debian/control vendored
View File

@@ -60,7 +60,7 @@ Description: music player and music collection organizer
- Edit tags on audio files
- Automatically retrieve tags from MusicBrainz
- Album cover art from Last.fm, Musicbrainz, Discogs, Musixmatch, Deezer, Tidal, Qobuz and Spotify
- Song lyrics from Genius, Musixmatch, ChartLyrics, lyrics.ovh, lololyrics.com, songlyrics.com, azlyrics.com and elyrics.net
- Song lyrics from Genius, Musixmatch, ChartLyrics, lyrics.ovh, lololyrics.com, songlyrics.com, azlyrics.com, elyrics.net, letras.mus.br and LyricFind
- Audio analyzer
- Audio equalizer
- Transfer music to mass-storage USB players, MTP compatible devices and iPod Nano/Classic

View File

@@ -31,7 +31,7 @@
<li>Edit tags on audio files</li>
<li>Automatically retrieve tags from MusicBrainz</li>
<li>Album cover art from Last.fm, Musicbrainz, Discogs, Musixmatch, Deezer, Tidal, Qobuz and Spotify</li>
<li>Song lyrics from Genius, Musixmatch, ChartLyrics, lyrics.ovh, lololyrics.com, songlyrics.com, azlyrics.com and elyrics.net</li>
<li>Song lyrics from Genius, Musixmatch, ChartLyrics, lyrics.ovh, lololyrics.com, songlyrics.com, azlyrics.com, elyrics.net, letras.mus.br and LyricFind</li>
<li>Audio analyzer and equalizer</li>
<li>Transfer music to mass-storage USB players, MTP compatible devices and iPod Nano/Classic</li>
<li>Scrobbler with support for Last.fm, Libre.fm and ListenBrainz</li>

View File

@@ -29,7 +29,7 @@ Features:
.br
- Album cover art from Last.fm, Musicbrainz, Discogs, Musixmatch, Deezer, Tidal, Qobuz and Spotify
.br
- Song lyrics from Lyrics.com, Genius, Musixmatch, ChartLyrics, lyrics.ovh and lololyrics.com
- Song lyrics from Genius, Musixmatch, ChartLyrics, lyrics.ovh, lololyrics.com, songlyrics.com, azlyrics.com, elyrics.net, letras.mus.br and LyricFind
.br
- Support for multiple backends
.br

View File

@@ -93,7 +93,7 @@ Features:
- Edit tags on audio files
- Automatically retrieve tags from MusicBrainz
- Album cover art from Last.fm, Musicbrainz, Discogs, Musixmatch, Deezer, Tidal, Qobuz and Spotify
- Song lyrics from Genius, Musixmatch, ChartLyrics, lyrics.ovh, lololyrics.com, songlyrics.com, azlyrics.com and elyrics.net
- Song lyrics from Genius, Musixmatch, ChartLyrics, lyrics.ovh, lololyrics.com, songlyrics.com, azlyrics.com, elyrics.net, letras.mus.br and LyricFind
- Support for multiple backends
- Audio analyzer
- Audio equalizer

View File

@@ -148,6 +148,8 @@ void GeniusLyricsProvider::StartSearch(const int id, const LyricsSearchRequest &
QNetworkReply *reply = CreateGetRequest(QUrl(QLatin1String(kUrlSearch)), url_query);
QObject::connect(reply, &QNetworkReply::finished, this, [this, reply, id]() { HandleSearchReply(reply, id); });
qLog(Debug) << name_ << "Sending request for" << url_query.query();
}
GeniusLyricsProvider::JsonObjectResult GeniusLyricsProvider::ParseJsonObject(QNetworkReply *reply) {
@@ -302,10 +304,8 @@ void GeniusLyricsProvider::HandleSearchReply(QNetworkReply *reply, const int id)
const QString artist = primary_artist["name"_L1].toString();
const QString title = object_result["title"_L1].toString();
// Ignore results where both the artist and title don't match.
if (!artist.startsWith(search->request.albumartist, Qt::CaseInsensitive) &&
!artist.startsWith(search->request.artist, Qt::CaseInsensitive) &&
!title.startsWith(search->request.title, Qt::CaseInsensitive)) {
// Ignore results where the artist or title don't begin or end the same
if (!StartsOrEndsMatch(artist, search->request.artist) || !StartsOrEndsMatch(title, search->request.title)) {
continue;
}
@@ -323,6 +323,12 @@ void GeniusLyricsProvider::HandleSearchReply(QNetworkReply *reply, const int id)
QNetworkReply *new_reply = CreateGetRequest(url);
QObject::connect(new_reply, &QNetworkReply::finished, this, [this, new_reply, search, url]() { HandleLyricReply(new_reply, search->id, url); });
qLog(Debug) << name_ << "Sending request for" << url;
// If full match, don't bother iterating further
if (artist == search->request.albumartist && artist == search->request.artist && title == search->request.title) {
break;
}
}
}
@@ -363,12 +369,18 @@ void GeniusLyricsProvider::HandleLyricReply(QNetworkReply *reply, const int sear
return;
}
const QString content = QString::fromUtf8(data);
QString lyrics = HtmlLyricsProvider::ParseLyricsFromHTML(content, QRegularExpression(u"<div[^>]*>"_s), QRegularExpression(u"<\\/div>"_s), QRegularExpression(u"<div data-lyrics-container=[^>]+>"_s), true);
if (lyrics.isEmpty()) {
lyrics = HtmlLyricsProvider::ParseLyricsFromHTML(content, QRegularExpression(u"<div[^>]*>"_s), QRegularExpression(u"<\\/div>"_s), QRegularExpression(u"<div class=\"lyrics\">"_s), true);
}
static const QRegularExpression start_tag(u"<div[^>]*>"_s);
static const QRegularExpression end_tag(u"<\\/div>"_s);
static const QRegularExpression lyrics_start(u"<div data-lyrics-container=[^>]+>"_s);
static const QRegularExpression regex_html_tag_span_trans(u"<span class=\"LyricsHeader__Translations[^>]*>[^<]*</span>"_s);
static const QRegularExpression regex_html_tag_div_ellipsis(u"<div class=\"LyricsHeader__TextEllipsis[^>]*>[^<]*</div>"_s);
static const QRegularExpression regex_html_tag_span_contribs(u"<span class=\"ContributorsCreditSong__Contributors[^>]*>[^<]*</span>"_s);
static const QRegularExpression regex_html_tag_div_bio(u"<div class=\"SongBioPreview__Container[^>]*>.*?</div>"_s);
static const QRegularExpression regex_html_tag_h2(u"<h2 [^>]*>[^<]*</h2>"_s);
static const QList<QRegularExpression> regex_removes{ regex_html_tag_span_trans, regex_html_tag_div_ellipsis, regex_html_tag_span_contribs, regex_html_tag_div_bio, regex_html_tag_h2 };
const QString lyrics = HtmlLyricsProvider::ParseLyricsFromHTML(QString::fromUtf8(data), start_tag, end_tag, lyrics_start, true, regex_removes);
if (!lyrics.isEmpty()) {
LyricsSearchResult result(lyrics);
result.artist = lyric.artist;
@@ -404,3 +416,17 @@ void GeniusLyricsProvider::EndSearch(const int id, const LyricsSearchRequest &re
Q_EMIT SearchFinished(id, results);
}
// Loose, case-insensitive comparison of two strings.
// Both arguments are taken by value because they are normalized in place:
// the characters `!,.:;` are stripped and the quote variants ´ and ` are
// replaced with a plain apostrophe before comparing.
// Returns true only when both normalized strings are non-empty and one of
// them begins or ends with the other (which includes being equal).
bool GeniusLyricsProvider::StartsOrEndsMatch(QString s, QString t) {

  static const QRegularExpression puncts_regex(u"[!,.:;]"_s);
  static const QRegularExpression quotes_regex(u"[´`]"_s);

  // Same normalization is applied to both sides.
  const auto normalize = [](QString &text) {
    text.remove(puncts_regex);
    text.replace(quotes_regex, u"'"_s);
  };
  normalize(s);
  normalize(t);

  // An empty string on either side never counts as a match.
  if (s.isEmpty() || t.isEmpty()) {
    return false;
  }

  if (s.compare(t, Qt::CaseInsensitive) == 0) {
    return true;
  }

  // Match at the beginning, in either direction.
  if (s.startsWith(t, Qt::CaseInsensitive) || t.startsWith(s, Qt::CaseInsensitive)) {
    return true;
  }

  // Match at the end, in either direction.
  return s.endsWith(t, Qt::CaseInsensitive) || t.endsWith(s, Qt::CaseInsensitive);

}

View File

@@ -79,6 +79,9 @@ class GeniusLyricsProvider : public JsonLyricsProvider {
void HandleSearchReply(QNetworkReply *reply, const int id);
void HandleLyricReply(QNetworkReply *reply, const int search_id, const QUrl &url);
private:
// Returns true if `s` and `t` are both non-empty after removing the characters `!,.:;`
// and replacing ´ and ` with ', and one of them begins or ends with the other (case-insensitive).
static bool StartsOrEndsMatch(QString s, QString t);
private:
OAuthenticator *oauth_;
mutable QMutex mutex_access_token_;

View File

@@ -109,7 +109,7 @@ void HtmlLyricsProvider::HandleLyricsReply(QNetworkReply *reply, const int id, c
}
QString HtmlLyricsProvider::ParseLyricsFromHTML(const QString &content, const QRegularExpression &start_tag, const QRegularExpression &end_tag, const QRegularExpression &lyrics_start, const bool multiple) {
QString HtmlLyricsProvider::ParseLyricsFromHTML(const QString &content, const QRegularExpression &start_tag, const QRegularExpression &end_tag, const QRegularExpression &lyrics_start, const bool multiple, const QList<QRegularExpression> &regex_removes) {
Q_ASSERT(QThread::currentThread() != qApp->thread());
@@ -153,29 +153,30 @@ QString HtmlLyricsProvider::ParseLyricsFromHTML(const QString &content, const QR
if (!lyrics.isEmpty()) {
lyrics.append(u'\n');
}
lyrics.append(content.mid(start_lyrics_idx, end_lyrics_idx - start_lyrics_idx).remove(u'\r').remove(u'\n'));
}
}
while (start_idx > 0 && multiple);
for (auto it = regex_removes.cbegin(); it != regex_removes.cend(); it++) {
lyrics.remove(*it);
}
static const QRegularExpression regex_html_tag_a(u"<a [^>]*>[^<]*</a>"_s);
static const QRegularExpression regex_html_tag_script(u"<script>[^>]*</script>"_s);
static const QRegularExpression regex_html_tag_div(u"<div [^>]*>×</div>"_s);
static const QRegularExpression regex_html_tag_br(u"<br[^>]*>"_s);
static const QRegularExpression regex_html_tag_p_close(u"</p>"_s);
static const QRegularExpression regex_html_tags(u"<[^>]*>"_s);
lyrics.append(content.mid(start_lyrics_idx, end_lyrics_idx - start_lyrics_idx)
.remove(u'\r')
.remove(u'\n')
.remove(regex_html_tag_a)
static const QRegularExpression regex_newlines_squash(u"\\n{3,}"_s);
lyrics.remove(regex_html_tag_a)
.remove(regex_html_tag_script)
.remove(regex_html_tag_div)
.replace(regex_html_tag_br, u"\n"_s)
.replace(regex_html_tag_p_close, u"\n\n"_s)
.remove(regex_html_tags)
.trimmed());
}
else {
start_idx = -1;
}
}
while (start_idx > 0 && multiple);
.replace(regex_newlines_squash, u"\n\n"_s);
lyrics = lyrics.trimmed();
if (lyrics.length() > 6000 || lyrics.contains("there are no lyrics to"_L1, Qt::CaseInsensitive)) {
return QString();

View File

@@ -41,7 +41,7 @@ class HtmlLyricsProvider : public LyricsProvider {
virtual bool StartSearchAsync(const int id, const LyricsSearchRequest &request) override;
static QString ParseLyricsFromHTML(const QString &content, const QRegularExpression &start_tag, const QRegularExpression &end_tag, const QRegularExpression &lyrics_start, const bool multiple);
static QString ParseLyricsFromHTML(const QString &content, const QRegularExpression &start_tag, const QRegularExpression &end_tag, const QRegularExpression &lyrics_start, const bool multiple, const QList<QRegularExpression> &regex_removes = {});
protected:
virtual QUrl Url(const LyricsSearchRequest &request) = 0;