Python 自动化爬取网易云音乐歌手歌词实战教程

张

张建站

2026/4/28 21:28:21

10分钟阅读

网易云音乐歌词数据分散于多页面，手动复制效率低下、易出现内容遗漏，且无法满足批量采集需求。自动化爬取面临两大核心技术难点：其一，歌词数据通过 AJAX 异步动态加载，原生 `requests` 仅能获取静态空壳 HTML，无法直接解析有效数据；其二，平台反爬机制严苛，高频请求易触发 403 访问拦截、滑块验证等限制。本文基于 Python 构建端到端企业级歌词爬取系统，覆盖 API 逆向分析、请求参数加密、请求头伪装、异常容错、本地持久化存储全流程，并集成亿牛云爬虫代理高效解决 IP 封禁问题，实现稳定、批量的歌手歌词采集。

## 一、环境依赖配置

各库核心作用：

- `requests`：高性能 HTTP 请求客户端，负责发送网络请求、获取接口响应数据
- `demjson3`：兼容非标准 JSON 格式解析，适配网易云音乐 API 非常规响应数据
- `cryptography`：提供 AES 对称加密能力，用于生成平台接口必需的加密参数

## 二、API 逆向：加密参数生成

网易云音乐后端接口采用参数加密校验机制，是数据爬取的核心技术壁垒，请求参数需经过加密处理后才能正常调用。

核心加密参数说明：

| 参数名 | 功能说明 | 生成规则 |
| --- | --- | --- |
| params | 封装业务请求参数（歌曲 ID、时间戳等） | AES-CBC 模式加密 + Base64 编码 |
| encSecKey | 加密密钥校验参数 | 随机生成 16 位十六进制字符串 |
| nonce | 防重放随机数 | 随机生成 16 位十六进制字符串 |

加密实现代码：

```python
import base64
import random
import json
from Crypto.Cipher import AES
from Crypto.Util.Padding import pad


def generate_encrypted_params(params):
    """
    网易云音乐API加密参数生成函数
    :param params: 原始业务参数字典
    :return: 加密后可直接用于请求的参数
    """
    # 生成随机密钥与随机数
    enc_sec_key = random.randbytes(16).hex()[:16]
    nonce = random.randbytes(16).hex()[:16]

    # 业务参数序列化
    params_json = json.dumps(params)

    # 网易云音乐固定加密密钥与偏移量
    key = b"0CoJUmKQw8gw8ig"
    iv = b"0102030405060708"

    # AES-CBC加密 + Base64编码
    cipher = AES.new(key, AES.MODE_CBC, iv)
    encrypted_data = cipher.encrypt(pad(params_json.encode("utf-8"), AES.block_size))
    encrypted_params_b64 = base64.b64encode(encrypted_data).decode("utf-8")

    return {
        "params": encrypted_params_b64,
        "encSecKey": enc_sec_key,
        "nonce": nonce
    }
```

## 三、歌词接口请求封装

网易云音乐标准歌词 API 接口：`https://music.163.com/weapi/song/lyric?csrf_token=`

基于面向对象思想封装爬虫核心类，实现请求伪装、代理集成、异常处理一体化：

```python
import requests
import random


class NetEaseMusicCrawler:
    def __init__(self, use_proxy=False, proxy_config=None):
        self.base_url = "https://music.163.com"
        self.use_proxy = use_proxy
        self.proxy_config = proxy_config
        # 模拟浏览器请求头，绕过基础反爬
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Referer": "https://music.163.com/",
            "Accept": "*/*",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "close"
        }

    def get_lyric(self, song_id):
        """
        单首歌曲歌词获取
        :param song_id: 歌曲唯一标识ID
        :return: 原始歌词文本 / None
        """
        # 构造业务参数
        params = {"id": song_id, "lv": -1, "tv": -1, "csrf_token": ""}
        encrypted_params = generate_encrypted_params(params)
        url = f"{self.base_url}/weapi/song/lyric?csrf_token="

        # 代理配置
        proxies = self._get_proxies()

        try:
            # 发送POST请求
            resp = requests.post(
                url,
                data=encrypted_params,
                headers=self.headers,
                proxies=proxies,
                timeout=10
            )
            # 状态码容错处理
            if resp.status_code == 200:
                return self._parse_lyric(resp.text)
            elif resp.status_code == 429:
                print(f"请求频繁(429)，建议延长请求间隔")
            elif resp.status_code == 403:
                print(f"访问被拦截(403)，建议切换IP或更新请求头")
            return None
        except Exception as e:
            print(f"请求异常: {str(e)}")
            return None

    def _parse_lyric(self, response_text):
        """非标准JSON歌词数据解析"""
        try:
            data = demjson3.decode(response_text)
            return data.get("lrc", {}).get("lyric", "") if data.get("code") == 200 else None
        except Exception:
            return None

    def _get_proxies(self):
        """代理获取工具方法"""
        if not self.use_proxy or not self.proxy_config:
            return None
        proxy_meta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % self.proxy_config
        proxies = {"http": proxy_meta, "https": proxy_meta}
        self.headers["Proxy-Tunnel"] = str(random.randint(1, 10000))
        return proxies
```

## 四、批量爬取歌手全量歌曲

通过歌手 ID 获取热门歌曲列表，实现批量歌词自动化下载与本地存储：

```python
import os
import time


def get_artist_songs(self, artist_id):
    """获取歌手热门歌曲列表（单次最多50首）"""
    url = f"{self.base_url}/weapi/artist/top/song"
    params = {"id": artist_id, "offset": 0, "limit": 50, "total": True}
    encrypted_params = generate_encrypted_params(params)
    proxies = self._get_proxies()

    try:
        resp = requests.post(url, data=encrypted_params, headers=self.headers, proxies=proxies, timeout=10)
        if resp.status_code == 200:
            data = demjson3.decode(resp.text)
            return data.get("songs", []) if data.get("code") == 200 else []
    except Exception:
        return []
    return []


def batch_download_lyrics(self, artist_id, save_dir="netease_lyrics"):
    """
    批量下载歌手歌词
    :param artist_id: 歌手ID
    :param save_dir: 歌词保存目录
    """
    os.makedirs(save_dir, exist_ok=True)
    songs = self.get_artist_songs(artist_id)
    print(f"成功获取{len(songs)}首歌曲")

    success_count = 0
    for song in songs:
        song_id = song.get("id")
        song_name = song.get("name", "未知歌曲")
        artist_name = song.get("ar", [{}])[0].get("name", "未知歌手")
        print(f"正在下载: {artist_name} - {song_name}")

        lyric = self.get_lyric(song_id)
        if lyric:
            # 过滤文件名非法字符，避免保存失败
            valid_filename = "".join([c for c in f"{artist_name}-{song_name}" if c.isalnum() or c in (" ", "-", "_")])
            filepath = os.path.join(save_dir, f"{valid_filename}.lrc")
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(lyric)
            print(f" ✓ 保存成功")
            success_count += 1
        else:
            print(f" ✗ 下载失败")

        # 控制请求频率，规避反爬
        time.sleep(random.uniform(1, 3))

    print(f"\n任务完成，成功下载{success_count}/{len(songs)}首歌词")
    return success_count


# 绑定方法到类
NetEaseMusicCrawler.get_artist_songs = get_artist_songs
NetEaseMusicCrawler.batch_download_lyrics = batch_download_lyrics
```

## 五、代理 IP 集成与反爬规避

网易云音乐对单 IP 请求频率、请求总量实施严格限制，高频访问会直接触发滑块验证、IP 永久封禁。亿牛云爬虫代理通过动态 IP 池技术，可有效分散请求来源，突破反爬限制。

代理配置与启动示例：

```python
def main():
    # 亿牛云隧道代理配置
    proxy_config = {
        "host": "t.16yun.cn",
        "port": "31111",
        "username": "your_username",
        "password": "your_password"
    }

    # 初始化爬虫，开启代理模式
    crawler = NetEaseMusicCrawler(use_proxy=True, proxy_config=proxy_config)

    # 批量爬取歌词（示例：周杰伦，歌手ID：6452）
    crawler.batch_download_lyrics(artist_id=6452, save_dir="netease_lyrics")


if __name__ == "__main__":
    main()
```

代理核心优势：

- 隧道代理技术：固定代理入口，每次请求自动分配独立出口 IP
- 海量 IP 资源：标准版 IP 池 30 万，加强版 80 万
- 高性能网络：延迟低至 100ms，支持毫秒级 IP 切换
- 高并发：QPS 上限 5-300 次 / 秒，适配批量采集场景

## 六、边界场景处理与性能优化

- 文件名合法性校验：歌曲名常包含 `/ \ : * ?` 等系统非法字符，需过滤后再保存文件
- HTTPS IP 粘性问题：HTTPS 请求默认存在连接复用，添加 `Connection: Close` 请求头可强制切换 IP
- 异常容错：新增网络超时、解析失败、空数据等场景的降级处理，提升系统稳定性

## 七、完整可运行代码

整合所有模块，提供开箱即用的完整实现：

```python
import requests
import random
import os
import time
import json
import base64
from Crypto.Cipher import AES
from Crypto.Util.Padding import pad
import demjson3


def generate_encrypted_params(params):
    enc_sec_key = random.randbytes(16).hex()[:16]
    nonce = random.randbytes(16).hex()[:16]
    params_json = json.dumps(params)
    key = b"0CoJUmKQw8gw8ig"
    iv = b"0102030405060708"
    cipher = AES.new(key, AES.MODE_CBC, iv)
    encrypted_data = cipher.encrypt(pad(params_json.encode("utf-8"), AES.block_size))
    encrypted_params_b64 = base64.b64encode(encrypted_data).decode("utf-8")
    return {"params": encrypted_params_b64, "encSecKey": enc_sec_key, "nonce": nonce}


class NetEaseMusicCrawler:
    def __init__(self, use_proxy=False, proxy_config=None):
        self.base_url = "https://music.163.com"
        self.use_proxy = use_proxy
        self.proxy_config = proxy_config
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
            "Referer": "https://music.163.com/",
            "Accept": "*/*",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "close"
        }

    def _get_proxies(self):
        if not self.use_proxy or not self.proxy_config:
            return None
        proxy_meta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % self.proxy_config
        proxies = {"http": proxy_meta, "https": proxy_meta}
        self.headers["Proxy-Tunnel"] = str(random.randint(1, 10000))
        return proxies

    def get_lyric(self, song_id):
        params = {"id": song_id, "lv": -1, "tv": -1, "csrf_token": ""}
        encrypted_params = generate_encrypted_params(params)
        url = f"{self.base_url}/weapi/song/lyric?csrf_token="
        proxies = self._get_proxies()
        try:
            resp = requests.post(url, data=encrypted_params, headers=self.headers, proxies=proxies, timeout=10)
            if resp.status_code == 200:
                return self._parse_lyric(resp.text)
            return None
        except:
            return None

    def _parse_lyric(self, response_text):
        try:
            data = demjson3.decode(response_text)
            return data.get("lrc", {}).get("lyric", "") if data.get("code") == 200 else None
        except:
            return None

    def get_artist_songs(self, artist_id):
        url = f"{self.base_url}/weapi/artist/top/song"
        params = {"id": artist_id, "offset": 0, "limit": 50, "total": True}
        encrypted_params = generate_encrypted_params(params)
        proxies = self._get_proxies()
        try:
            resp = requests.post(url, data=encrypted_params, headers=self.headers, proxies=proxies, timeout=10)
            if resp.status_code == 200:
                data = demjson3.decode(resp.text)
                return data.get("songs", []) if data.get("code") == 200 else []
        except:
            return []
        return []

    def batch_download_lyrics(self, artist_id, save_dir="netease_lyrics"):
        os.makedirs(save_dir, exist_ok=True)
        songs = self.get_artist_songs(artist_id)
        success_count = 0
        for song in songs:
            song_id = song.get("id")
            song_name = song.get("name", "未知")
            artist_name = song.get("ar", [{}])[0].get("name", "未知")
            lyric = self.get_lyric(song_id)
            if lyric:
                valid_fn = "".join([c for c in f"{artist_name}-{song_name}" if c.isalnum() or c in (" ", "-", "_")])
                with open(os.path.join(save_dir, f"{valid_fn}.lrc"), "w", encoding="utf-8") as f:
                    f.write(lyric)
                success_count += 1
            time.sleep(random.uniform(1, 3))
        print(f"完成{success_count}/{len(songs)}")


def main():
    proxy_config = {"host": "t.16yun.cn", "port": "31111", "username": "your_user", "password": "your_pwd"}
    crawler = NetEaseMusicCrawler(use_proxy=True, proxy_config=proxy_config)
    crawler.batch_download_lyrics(6452)


if __name__ == "__main__":
    main()
```