Я тестировал URL-адрес, используя python, и получил результат от str
itdUrlforrip.text content: http://itdmusic.in/category/new-releases/page/4
полный код
#!/usr/bin/python
import requests
import re
import regex
from pyquery import PyQuery
#get each
link1 = open('/Users/R/Downloads/itdUrlforrip.txt','r').read()
list1 = link1.split('\n')
list2 = []
for eachlink1 in list1:
linkSub1 = requests.get(eachlink1).text
splitContent = linkSub1.split("Facebook")
splitContent1 = splitContent[0]
list2.append(splitContent1)
list2GLStr = ("\n".join(list2))
urlAll = regex.findall('itdmusic\.in\/\d\d\/.+\.html', list2GLStr)
allUrlrmDup1 = list(dict.fromkeys(urlAll))
#get list of url from input
allUrlrmDup1Ah = regex.sub('itdmusic', 'http://itdmusic', str(allUrlrmDup1))
allUrlrmDup1Ah2 = regex.sub('\'', '', str(allUrlrmDup1Ah))
allUrlrmDup1Ah3 = regex.sub('\[', '', str(allUrlrmDup1Ah2))
allUrlrmDup1Ah4 = regex.sub('\]', '', str(allUrlrmDup1Ah3))
allUrlrmDup1AhGL = ("\n".join(list(allUrlrmDup1Ah4.split(', '))))
allUrlrmDup1AhList = allUrlrmDup1AhGL.split('\n')
list3 = []
list4 = []
for eachlink2 in allUrlrmDup1AhList:
linkSub2 = requests.get(eachlink2).text
urlGdr = regex.findall('drive\.google\.com\/.{41}', linkSub2)
urlOth = regex.findall('https\:\/\/www\d\d\d\.zippyshare\.com\/v.{19}|https\:\/\/www\d\d\.zippyshare\.com\/v.{19}|https\:\/\/www\d\.zippyshare\.com\/v.{19}|https?:\/\/douploads\.com\/.{12}|https?:\/\/www\.mirrored\.to\/.{14}|https?:\/\/mir\.cr\/.{8}|https?:\/\/hexupload\.net\/.{12}|https?:\/\/intoupload\.net\/.{12}|https?:\/\/www\.dropbox\.com\/s\/.{15}|https?:\/\/dbree\.org\/v\/.{6}|https?:\/\/dropapk\.to\/.{12}|https?:\/\/www\.sendspace\.com\/file\/.{6}|https?:\/\/gestyy\.com\/.{6}|https?:\/\/ouo\.io\/\w{6}|https?:\/\/mega\.nz.{55}|https?:\/\/bit\.ly.{8}', linkSub2)
urlska = regex.findall('https?\:\/\/itdmusic\.in\/skipads\/.+\/\'', linkSub2)
urlskaStr = str(urlska)
urlska2 = regex.sub('\/\'', '', urlskaStr)
list3.append(urlGdr)
list3.append(urlOth)
list4.append(urlska2)
, затем I
print(list4)
и результат
'[]', '[]', '[]', '[]', '[]', '[]', '[]', '[]', '["http://itdmusic.in/skipads/2020/03/12/luke-bryan-one-margarita-pre-single"]', '["http://itdmusic.in/skipads/2020/03/12/kota-banks-italiana-single"]'
для 32 с
, так что есть ли способ получить избавиться от '[]' и просто получить здесь URL? Я пробую кучу вещей и до сих пор не могу понять с помощью регулярных выражений и re. Я немного запутался, используя для ххх в ххх.