ABOUT ME

-

Today
-
Yesterday
-
Total
-
  • Extract reddit's posts and make export.html with Flask framework
    Project using python/Jobs scrapper 2020. 12. 21. 14:03

    main.py

    # main.py
    #
    # Flask entry point: builds the application object and the list of
    # subreddits that the home page offers for aggregation.

    from flask import Flask, redirect, render_template, request

    from scrapperReddit import get_subreddits

    # Templates are loaded from ./src/templates instead of Flask's default
    # "templates" folder.
    app = Flask("Job Scrapper", template_folder="./src/templates")
    # In-memory store; the routes visible in this file neither read nor write it.
    db = {}

    # Subreddit names rendered as checkboxes by home.html.
    subreddits = [
        "javascript",
        "reactjs",
        "reactnative",
        "programming",
        "css",
        "golang",
        "flutter",
        "rust",
        "django",
    ]
    
    @app.route('/')
    def index():
      """Render the landing page with the list of selectable subreddits.

      Returns:
        The rendered ``home.html`` page, or a plain-text 500 response when
        rendering raises ``IOError`` (for example when the template cannot
        be loaded).
      """
      try:
        return render_template("home.html", subreddits=subreddits)
      except IOError:
        # Redirecting to "/" would re-enter this very view and fail the same
        # way, so the browser would loop until it gives up. Report the
        # failure explicitly instead.
        return "Unable to load the home page.", 500
    
    @app.route('/reddit')
    def reddit():
      """Scrape the subreddits named in the query string and show the result.

      The query parameters are converted to a dict and handed to
      ``get_subreddits``; whatever it returns is passed to ``reddit.html``
      as ``subreddits``. Any ``IOError`` raised along the way sends the
      user back to the home page.
      """
      try:
        return render_template(
          "reddit.html",
          subreddits=get_subreddits(request.args.to_dict()),
        )
      except IOError:
        return redirect("/")

       home 화면에 subreddits 변수를 전달한다. home.html을 보면 전달받은 subreddits로 checkbox 형식의 form을 만든 것을 확인할 수 있다.

       @app.route('/reddit')에서 request.args.to_dict()는 사용자가 query string으로 보낸 여러 개의 변수를 dictionary 형태로 반환한다. 그 다음 scrapperReddit.py에 있는 get_subreddits()로 사용자가 선택한 subreddit의 posts를 모두 scrap한다. 마지막으로, scrap한 자료들을 reddit.html에 전달하면서 reddit.html을 렌더링한다.

    scrapperReddit.py

    import requests
    from bs4 import BeautifulSoup
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36"
    }
    
    
    def get_post(post):
        """Extract title, vote count and link from one post element.

        Args:
            post: A parsed element exposing ``find(name, attrs)``; the
                matches it returns must expose ``.text`` and ``.attrs``.

        Returns:
            A dict with the keys ``"article"``, ``"votes"`` and ``"url"``,
            or ``None`` when the element has no title link or is missing
            the heading, the vote counter or the link target.
        """
        from urllib.parse import urljoin

        # The attribute filter must be a mapping; the previous code passed a
        # set literal ({"class", "..."}) by mistake.
        link = post.find("a", {"class": "SQnoC3ObvgnGjWt90zD9Z"})
        if link is None:
            return None

        title = post.find("h3")
        votes = post.find("div", {"class": "_1rZYMD_4xY3gRcSS3p8ODO"})
        href = link.attrs.get("href")
        # Skip incomplete entries instead of failing the whole scrape with
        # an AttributeError/KeyError.
        if title is None or votes is None or not href:
            return None

        return {
            "article": title.text,
            "votes": votes.text,
            # urljoin copes with both "/r/..." and absolute hrefs and does
            # not produce the "reddit.com//r/..." double slash.
            "url": urljoin("https://www.reddit.com/", href),
        }
    
    
    def get_subreddit(aggregated_subreddit):
        """Scrape this month's top posts of a single subreddit.

        Args:
            aggregated_subreddit: Subreddit name without the ``r/`` prefix.

        Returns:
            A dict ``{"language": <name>, "posts": [...]}`` where each post
            is a dict produced by ``get_post``. ``posts`` is empty when the
            expected posts container is not present in the page.

        Raises:
            requests.exceptions.RequestException: If the request fails or
                does not complete within the timeout.
        """
        from urllib.parse import quote

        subreddit = {"language": aggregated_subreddit, "posts": []}

        # The name originates from the request's query string, so escape it
        # before placing it in the URL path.
        safe_name = quote(aggregated_subreddit, safe="")
        url = f"https://www.reddit.com/r/{safe_name}/top/?t=month"
        # Without a timeout a stalled connection would block the caller
        # indefinitely.
        result = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(result.text, "html.parser")

        # The attribute filter must be a mapping, not a set literal.
        posts_container = soup.find("div", {"class": "rpBJOHq2PR60pnwJlUyP0"})
        if posts_container is None:
            # Page layout differs from what the scraper expects (or the
            # request was answered with a different page): nothing to extract.
            return subreddit

        # Only direct child <div>s without a class attribute are considered.
        candidates = posts_container.find_all("div", {"class": None}, recursive=False)
        for candidate in candidates:
            extracted_post = get_post(candidate)
            if extracted_post:
                subreddit["posts"].append(extracted_post)
        return subreddit
    
    
    def get_subreddits(aggregated_subreddits_dict):
        """Scrape every subreddit named by the given iterable.

        Iterates ``aggregated_subreddits_dict`` (for a dict: its keys), runs
        ``get_subreddit`` on each name, prints each result and returns the
        results as a list in iteration order.
        """

        def _scrape_and_print(name):
            # Fetch one subreddit and echo it to stdout before handing it back.
            scraped = get_subreddit(name)
            print(scraped)
            return scraped

        return [_scrape_and_print(name) for name in aggregated_subreddits_dict]

    home.html

    <!DOCTYPE html>
    {# Landing page: a job-search form plus a checkbox form built from `subreddits`. #}
    <html lang="en">
    <head>
      <meta charset="utf-8">
      <title>Remote Jobs</title>
      {# <link> is a void element and takes no closing tag. #}
      <link href="https://andybrewer.github.io/mvp/mvp.css" rel="stylesheet">
    </head>
    <body>
      <header>
        <h1>Remote Jobs &amp; Reddit</h1>
      </header>
      <main>
        <div class="remoteJobs">
          <form action="/search">
            <h4>Search by term:</h4>
            <input type="text" name="term">
            <button type="submit">Find my job</button>
          </form>
        </div>
        <div class="redditForm">
          {# Each checked box is submitted to /reddit as a query parameter named after the subreddit. #}
          <form action="/reddit">
            <h4>Select the subreddits you're interested on:</h4>
            <ul>
              {% for subreddit in subreddits %}
              <li>
                <input type="checkbox" name="{{subreddit}}" id="{{subreddit}}" />
                <label for="{{subreddit}}">
                  r/{{subreddit}}
                </label>
              </li>
              {% endfor %}
            </ul>
            <button type="submit">Aggregate</button>
          </form>
        </div>
      </main>
    </body>

    </html>

     

    reddit.html

    <!DOCTYPE html>
    {# Results page: lists the posts of every entry in `subreddits` (each has `language` and `posts`). #}
    <html lang="en">

    <head>
      <meta charset="utf-8">
      <title>Reddit Reader</title>
      {# <link> is a void element and takes no closing tag. #}
      <link href="https://andybrewer.github.io/mvp/mvp.css" rel="stylesheet">
    </head>

    <body>
      <header>
        <h1>
          <a href="/">Reddit Reader</a>
        </h1>
        <h3>
          Reading: 
          {% for subreddit in subreddits %}
            r/{{subreddit.language}}
          {% endfor %}
        </h3>
      </header>
      <main>
        {% for subreddit in subreddits %}
          {% for post in subreddit.posts %}
            <div>
              <h3>
                {# Quote the attribute so a URL with spaces or special characters cannot break out of it;
                   rel="noopener noreferrer" keeps the new tab from reaching back into this page. #}
                <a href="{{post.url}}" target="_blank" rel="noopener noreferrer">{{post.article}}</a>
              </h3>
              <h4>{{post.votes}} upvotes · r/{{subreddit.language}}</h4>
              <hr>
            </div>
          {% endfor %}
        {% endfor %}
      </main>
    </body>

    </html>

    참고 자료

    소스 코드

    github.com/zpskek/web_scraper-v2/commit/943bbccfc6c658a139afb1c57799a8e16815a2d7#

    'Project using python > Jobs scrapper' 카테고리의 다른 글

    export.html with Flask framework  (0) 2020.12.21
    search.html  (0) 2020.12.21
    home.html with Flask framework  (0) 2020.12.21
    Scrap remote.com  (0) 2020.12.21
    Scrap WeWorkRemotely  (0) 2020.12.21

    댓글

Designed by Tistory.