Scraping Websites With Protected Content Using Pyqt5
I am trying to scrape content from a dynamic website that requires login. I found this piece of code that works for PyQt4: "Scraping Javascript driven web pages with PyQt4 - how to acce…"
Solution 1:
You have to use QWebEnginePage. Its operations are asynchronous, including obtaining the HTML, so the result is delivered through a callback. Also, QtWebEngine does not use QNetworkRequest, so you must use QWebEngineHttpRequest instead:
import sys
from PyQt5.QtCore import QByteArray, QUrl
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineCore import QWebEngineHttpRequest
from PyQt5.QtWebEngineWidgets import QWebEnginePage
class Render(QWebEnginePage):
    """Load a URL with an HTTP Basic ``Authorization`` header and keep its HTML.

    Constructing an instance blocks: it starts the Qt event loop and returns
    only after the page has finished loading and its HTML has been handed to
    ``handle_to_html``. The markup is then available through ``html``.
    """

    def __init__(self, url):
        """Load *url* and block until its HTML has been captured.

        Args:
            url: Address to load; it is parsed with ``QUrl.fromUserInput``.
        """
        # Reuse an existing application if there is one: Qt allows only a
        # single QApplication per process.
        app = QApplication.instance() or QApplication(sys.argv)
        QWebEnginePage.__init__(self)
        # Keep a reference so the application object is not garbage
        # collected while this page is still alive.
        self._app = app
        self._html = ""
        self.loadFinished.connect(self._loadFinished)

        # Placeholder credentials; replace with the real ones.
        username = "username"
        password = "password"
        credentials = ("%s:%s" % (username, password)).encode()
        token = QByteArray(credentials).toBase64().data()

        request = QWebEngineHttpRequest(QUrl.fromUserInput(url))
        # The header value is "Basic <base64>", with no colon after the
        # scheme name.
        request.setHeader(b"Authorization", b"Basic " + token)
        self.load(request)
        # Runs until handle_to_html() calls QApplication.quit().
        app.exec_()

    @property
    def html(self):
        """HTML captured from the page, or ``""`` if none was stored."""
        return self._html

    def _loadFinished(self):
        """Request the page's HTML once loading has finished."""
        # toHtml() is asynchronous; the markup arrives in the callback.
        self.toHtml(self.handle_to_html)

    def handle_to_html(self, html):
        """Store the delivered *html* and stop the event loop."""
        self._html = html
        QApplication.quit()
def main():
    """Render a sample page and print the HTML that was captured."""
    target = "http://www.google.com"
    page = Render(target)
    print(page.html)


# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Post a Comment for "Scraping Websites With Protected Content Using Pyqt5"