Scraping Websites With Protected Content Using Pyqt5

July 02, 2024 Post a Comment

I am trying to scrape content from a dynamic website that requires login. I found this piece of code that works for PyQt4 Scraping Javascript driven web pages with PyQt4 - how to acce

Solution 1:

You have to use QWebEnginePage, whose tasks are asynchronous (for example, the HTML is delivered through a callback rather than returned directly). Also, QtWebEngine does not use QNetworkRequest, so you must use QWebEngineHttpRequest:

import sys

from PyQt5.QtCore import QByteArray, QUrl
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineCore import QWebEngineHttpRequest
from PyQt5.QtWebEngineWidgets import QWebEnginePage


class Render(QWebEnginePage):
    """Load a URL with an HTTP Basic ``Authorization`` header and keep its HTML.

    Constructing the object blocks: it starts the Qt event loop and returns
    only after the page's HTML has been handed to ``handle_to_html``, which
    quits the application. The captured markup is then available through
    the ``html`` property.

    Args:
        url: Address to load; passed through ``QUrl.fromUserInput``.
        username: User name placed in the Basic credentials.
        password: Password placed in the Basic credentials.
    """

    def __init__(self, url, username="username", password="password"):
        # Qt permits a single application object per process, so reuse an
        # existing one instead of unconditionally creating another.
        app = QApplication.instance()
        if app is None:
            app = QApplication(sys.argv)
        QWebEnginePage.__init__(self)
        self.loadFinished.connect(self._loadFinished)

        self._html = ""

        credentials = ("%s:%s" % (username, password)).encode()
        base64string = QByteArray(credentials).toBase64()
        request = QWebEngineHttpRequest(QUrl.fromUserInput(url))
        # Basic scheme syntax is "Basic <base64>", with no colon after the
        # scheme name.
        request.setHeader(b"Authorization", b"Basic " + base64string.data())

        self.load(request)

        # Blocks until handle_to_html() calls QApplication.quit().
        app.exec_()

    @property
    def html(self):
        """Return the HTML captured by ``handle_to_html`` ("" until then)."""
        return self._html

    def _loadFinished(self):
        """Request the page's HTML once loading has finished."""
        # toHtml() is asynchronous: the markup is delivered to the callback.
        self.toHtml(self.handle_to_html)

    def handle_to_html(self, html):
        """Store the delivered HTML and stop the event loop."""
        self._html = html
        QApplication.quit()


def main():
    """Render a sample page and print the HTML that was captured."""
    target = "http://www.google.com"
    page = Render(target)
    print(page.html)


# Run the example only when this file is executed as a script.
if __name__ == "__main__":
    main()

Python Guru

Scraping Websites With Protected Content Using Pyqt5

Solution 1:

Post a Comment for "Scraping Websites With Protected Content Using Pyqt5"