-
Extract Reddit's posts and make export.html with Flask framework | Project using python/Jobs scrapper | 2020. 12. 21. 14:03
main.py
"""main.py -- Flask entry point for the job scrapper / Reddit reader.

Routes:
    /        renders home.html with the list of selectable subreddits.
    /reddit  scrapes the subreddits named in the query string and renders
             reddit.html with the result.
"""
from flask import Flask, redirect, render_template, request

from scrapperReddit import get_subreddits

app = Flask("Job Scrapper", template_folder="./src/templates")

db = {}

# Subreddits offered as checkboxes on the home page.
subreddits = [
    "javascript",
    "reactjs",
    "reactnative",
    "programming",
    "css",
    "golang",
    "flutter",
    "rust",
    "django",
]


@app.route("/")
def index():
    """Render the home page with the subreddit checkbox form."""
    # No `except IOError: redirect("/")` here: this handler *is* "/", so a
    # missing template (jinja2.TemplateNotFound subclasses IOError) would
    # send the browser into an endless redirect loop. Let Flask answer 500.
    return render_template("home.html", subreddits=subreddits)


@app.route("/reddit")
def reddit():
    """Scrape every subreddit named in the query string and render them.

    Each query-string key is passed to ``get_subreddits`` as a subreddit
    name. On ``IOError`` the user is redirected to the home page.
    """
    try:
        aggregated_subreddits_dict = request.args.to_dict()
        # Named `selected` so it does not shadow the module-level list.
        selected = get_subreddits(aggregated_subreddits_dict)
        return render_template("reddit.html", subreddits=selected)
    except IOError:
        # NOTE(review): presumably meant to cover network failures raised
        # by the scraper (requests exceptions derive from IOError).
        return redirect("/")
home 화면에 subreddits 변수를 전달한다. home.html을 보면 전달해준 subreddits를 checkbox 형식으로 form을 만든 것을 확인할 수 있다.
@app.route('/reddit')에서 request.args.to_dict()는 사용자로부터 받은 여러 개의 변수를 dictionary 형태로 받아서 반환한다. 그 다음 scrapperReddit.py에 있는 get_subreddits()로 사용자가 요구한 language의 posts들을 모두 scrap한다. 마지막으로, scrap한 자료들을 reddit.html에 보내면서 reddit.html을 렌더링한다.
scrapperReddit.py
"""scrapperReddit.py -- scrape the month's top posts of given subreddits."""
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36"
}

REDDIT_BASE_URL = "https://www.reddit.com/"
# Seconds to wait for Reddit before giving up; without a timeout a stalled
# connection would block the calling request handler indefinitely.
REQUEST_TIMEOUT = 10


def get_post(post):
    """Extract one post from its container element.

    Args:
        post: a BeautifulSoup tag wrapping a single post.

    Returns:
        A dict with keys ``article`` (title text), ``votes`` (vote-count
        text, ``""`` when the vote element is absent) and ``url`` (absolute
        link), or ``None`` when the element has no title link or heading.
    """
    # The attrs filter must be a dict. The previous set literal
    # {"class", "..."} only worked by accident and also matched elements
    # whose class is literally "class".
    link = post.find("a", {"class": "SQnoC3ObvgnGjWt90zD9Z"})
    heading = post.find("h3")
    if link is None or heading is None:
        return None
    href = link.attrs.get("href")
    if not href:
        return None
    votes_tag = post.find("div", {"class": "_1rZYMD_4xY3gRcSS3p8ODO"})
    return {
        "article": heading.text,
        "votes": votes_tag.text if votes_tag is not None else "",
        # urljoin avoids the "reddit.com//r/..." double slash for
        # root-relative hrefs and leaves absolute hrefs untouched.
        "url": urljoin(REDDIT_BASE_URL, href),
    }


def get_subreddit(aggregated_subreddit):
    """Fetch and parse the top-of-month listing of one subreddit.

    Args:
        aggregated_subreddit: subreddit name without the ``r/`` prefix.

    Returns:
        ``{"language": <name>, "posts": [<post dict>, ...]}``. ``posts`` is
        empty when the expected listing container is not found in the page.
    """
    subreddit = {"language": aggregated_subreddit, "posts": []}
    url = f"https://www.reddit.com/r/{aggregated_subreddit}/top/?t=month"
    result = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(result.text, "html.parser")
    posts_container = soup.find("div", {"class": "rpBJOHq2PR60pnwJlUyP0"})
    if posts_container is None:
        # Markup changed, or the subreddit page was not served as expected.
        return subreddit
    # Only direct <div> children that carry no class attribute are examined.
    posts_list = posts_container.find_all("div", {"class": None}, recursive=False)
    for a_post in posts_list:
        extracted_post = get_post(a_post)
        if extracted_post:
            subreddit["posts"].append(extracted_post)
    return subreddit


def get_subreddits(aggregated_subreddits_dict):
    """Scrape every subreddit named by the keys of the given mapping.

    Args:
        aggregated_subreddits_dict: mapping whose keys are subreddit names;
            the values are ignored.

    Returns:
        A list of ``get_subreddit`` results, in the mapping's key order.
    """
    return [get_subreddit(name) for name in aggregated_subreddits_dict]
home.html
<!DOCTYPE html>
{# Home page: a free-text job search form plus a checkbox form listing the
   `subreddits` passed in by the view. #}
<html lang="en">
  <head>
    <meta charset="utf-8">
    <title>Remote Jobs</title>
    {# <link> is a void element, so it takes no closing tag. #}
    <link href="https://andybrewer.github.io/mvp/mvp.css" rel="stylesheet">
  </head>
  <body>
    <header>
      <h1>Remote Jobs &amp; Reddit</h1>
    </header>
    <main>
      <div class="remoteJobs">
        <form action="/search">
          <h4>Search by term:</h4>
          <input type="text" name="term">
          <button type="submit">Find my job</button>
        </form>
      </div>
      <div class="redditForm">
        <form action="/reddit">
          <h4>Select the subreddits you're interested in:</h4>
          <ul>
            {# Each checked box is submitted as a query parameter named
               after its subreddit. #}
            {% for subreddit in subreddits %}
            <li>
              <input type="checkbox" name="{{subreddit}}" id="{{subreddit}}" />
              <label for="{{subreddit}}"> r/{{subreddit}} </label>
            </li>
            {% endfor %}
          </ul>
          <button type="submit">Aggregate</button>
        </form>
      </div>
    </main>
  </body>
</html>
reddit.html
<!DOCTYPE html>
{# Results page: lists every post of every entry in `subreddits`; each entry
   exposes `language` (the subreddit name) and `posts` (items with `url`,
   `article` and `votes`). #}
<html lang="en">
  <head>
    <meta charset="utf-8">
    <title>Reddit Reader</title>
    {# <link> is a void element, so it takes no closing tag. #}
    <link href="https://andybrewer.github.io/mvp/mvp.css" rel="stylesheet">
  </head>
  <body>
    <header>
      <h1>
        <a href="/">Reddit Reader</a>
      </h1>
      <h3>
        Reading:
        {% for subreddit in subreddits %}
        r/{{subreddit.language}}
        {% endfor %}
      </h3>
    </header>
    <main>
      {% for subreddit in subreddits %}
      {% for post in subreddit.posts %}
      <div>
        <h3>
          {# The href is quoted so a URL containing spaces or ">" cannot break
             the tag; rel="noopener noreferrer" keeps the page opened in the
             new tab from reaching back through window.opener. #}
          <a href="{{post.url}}" target="_blank" rel="noopener noreferrer">{{post.article}}</a>
        </h3>
        <h4>{{post.votes}} upvotes · r/{{subreddit.language}}</h4>
        <hr>
      </div>
      {% endfor %}
      {% endfor %}
    </main>
  </body>
</html>
참고 자료
소스 코드
github.com/zpskek/web_scraper-v2/commit/943bbccfc6c658a139afb1c57799a8e16815a2d7#
'Project using python > Jobs scrapper' 카테고리의 다른 글
export.html with Flask framework (0) 2020.12.21 search.html (0) 2020.12.21 home.html with Flask framework (0) 2020.12.21 Scrap remote.com (0) 2020.12.21 Scrap WeWorkRemotely (0) 2020.12.21