본문 바로가기
Back-end/Python Scrapper

Flask로 웹스크래퍼 만들기 - 전체 코드

by devraphy 2020. 8. 12.

* 복붙 하시고 코드정리 하시길 바랍니다. 

1. scrapper.py

import requests
from bs4 import BeautifulSoup

LIMIT = 50


def get_last_pages(url):
  """Return the number of the last result page for *url*.

  Fetches the search page, locates the pagination <div>, and collects the
  page numbers from its anchor links. The final anchor is the "Next"
  arrow (not a number), so it is skipped.
  """
  result = requests.get(url)
  soup = BeautifulSoup(result.text, "html.parser")
  pagination = soup.find("div", {"class": "pagination"})
  links = pagination.find_all('a')
  # Drop the trailing "Next" link, then keep the numeric page labels.
  pages = [int(link.string) for link in links[:-1]]
  return pages[-1]

 
def extract_job(html):
  """Extract one job posting from a result-card element.

  *html* is a BeautifulSoup tag for a single "jobsearch-SerpJobCard" div.
  Returns a dict with 'title', 'company', 'location', and 'link' keys
  (key order matters to exporter.save_to_file).
  """
  title = html.find("h2", {"class": "title"}).find("a")["title"]
  company = html.find("span", {"class": "company"})
  location = html.find("span", {"class": "location"}).string
  job_id = html["data-jk"]

  # Some cards wrap the company name in an <a>, others are plain text;
  # look it up once instead of calling find("a") twice.
  company_anchor = company.find("a")
  if company_anchor is not None:
    company = company_anchor.string
  else:
    company = company.string
  company = str(company).strip()

  return {
    'title': title,
    'company': company,
    'location': location,
    'link': f"https://kr.indeed.com/viewjob?jk={job_id}"
    }

 

def extract_jobs(last_page, url):
  """Scrape pages 0..last_page-1 of *url* and return a list of job dicts.

  Each page offsets the query by page*LIMIT results; every result card is
  passed through extract_job().
  """
  jobs = []

  for page in range(last_page):
    print(f"Scrapping INDEED: Page: {page}")
    # Renamed from `result`, which the original also reused as the inner
    # loop variable — shadowing the response object.
    response = requests.get(f"{url}&start={page*LIMIT}")
    soup = BeautifulSoup(response.text, "html.parser")
    cards = soup.find_all("div", {"class": "jobsearch-SerpJobCard"})

    for card in cards:
      jobs.append(extract_job(card))

  return jobs

 

def get_jobs(word):
  """Search kr.indeed.com for *word* and return every scraped posting."""
  search_url = f"https://kr.indeed.com/jobs?q={word}&limit={LIMIT}&radius=25"
  last_page = get_last_pages(search_url)
  return extract_jobs(last_page, search_url)

 


2. templates폴더 - home.html

<!DOCTYPE html>

<html>
  <head>
    <title>Job Search</title>
  </head>
  <body>
    <h1>Job Search</h1>
    <!-- Submits GET /report?word=... which triggers the scrape in main.py -->
    <form action="/report" method="get">
      <input placeholder='Search for a job' required name="word"/>
      <button>Search</button>
    </form>
  </body>
</html>

3. templates폴더 - report.html

<!DOCTYPE html>

<html>
  <head>
    <title>Job Search</title>
    <style>
      /* 4-column grid: one row per job (title / company / location / link) */
      section {
        display: grid;
        gap:20px;
        grid-template-columns: repeat(4, 1fr);
      }
    </style>
  </head>
  <body>
    <h1>Search Result</h1>
    <h3>Found {{resultsNumber}} results for: {{searchword}}</h3>
    <a href="/export?word={{searchword}}">Export results to CSV</a>
    <section>
      <h4>Title</h4>
      <h4>Company</h4>
      <h4>Location</h4>
      <h4>Link</h4>
      {% for job in jobs %}
        <span>{{job.title}}</span>
        <span>{{job.company}}</span>
        <span>{{job.location}}</span>
        <a href="{{job.link}}" target="_blank">Apply</a>
      {% endfor %}
    </section>
    <!-- Removed a stray </form> that had no matching opening tag -->
  </body>
</html>

  


4. exporter.py

import csv

def save_to_file(jobs):
  """Write *jobs* (a list of job dicts) to job.csv with a header row.

  The column order of the header matches the key order produced by
  scrapper.extract_job.
  """
  # with-block closes the file even on error (the original leaked the
  # handle); newline="" prevents blank rows on Windows per the csv docs.
  with open("job.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["title", "company", "location", "link"])

    for job in jobs:
      writer.writerow(list(job.values()))

 


5. main.py

from flask import Flask, render_template, request, redirect, send_file
from scrapper import get_jobs
from exporter import save_to_file

app = Flask("SuperScrapper")

# Fake in-memory DB caching scraped results (word -> list of job dicts).
# It must live outside the route handlers so it persists across requests.
db = {}

@app.route("/")
def home():
    """Render the search form (home.html)."""
    return render_template("home.html")


@app.route("/report")
def report():
    """Render the results page for the 'word' query parameter.

    Results are cached in the module-level db so repeating a search skips
    the slow scraping step. Without a search word, redirect to home.
    """
    word = request.args.get('word')
    if not word:
        # No search term supplied: back to the main page.
        return redirect("/")

    word = word.lower()  # normalize so cache lookups are case-insensitive
    jobs = db.get(word)
    if not jobs:
        # Cache miss: scrape now and remember the result for next time.
        jobs = get_jobs(word)
        db[word] = jobs

    return render_template(
        'report.html', searchword=word, resultsNumber=len(jobs), jobs=jobs)


@app.route("/export")
def export():
    """Send the cached results for 'word' as a CSV attachment.

    Any missing/invalid state (no word, nothing cached yet) falls back to
    a redirect to the main page.
    """
    try:
        word = request.args.get('word')
        if not word:  # no search word in the query string
            raise Exception()

        word = word.lower()
        jobs = db.get(word)  # results must already be cached by /report

        if not jobs:  # nothing scraped for this word yet
            raise Exception()

        save_to_file(jobs)
        # NOTE(review): attachment_filename was renamed download_name in
        # Flask 2.0 — update this call when the Flask version is bumped.
        return send_file('job.csv', mimetype='application/x-csv', attachment_filename='report.csv', as_attachment=True)

    # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
    # still propagate instead of being swallowed by the redirect.
    except Exception:
        return redirect("/")


# Bind to all interfaces so the dev server is reachable from outside the
# container/VM (e.g. repl.it); default port 5000.
app.run(host="0.0.0.0")

댓글