Links

Get jobs from categories

Tags: #remoteok #jobs #csv #snippet #opendata #dataframe
Author: Sanjeet Attili
With this notebook, you will be able to get job offers from RemoteOK:
  • URL: Job offer url.
  • TITLE: Job title.
  • COMPANY: Company name.
  • TAGS: Tags linked to the job.
  • LOCATION: Location linked to the job.
  • PUBLICATION_DATE: Date of publication.

Input

Import libraries

import pandas as pd
import requests
from datetime import datetime
import time

Setup Remoteok

# Tags to search for; get_jobs issues one API request per entry.
categories = [
    'machine learning',
    'data science',
    'nlp',
    'deep learning',
    'computer vision',
    'data',
    'natural language processing',
    'data engineer',
]

# Look-back window in days, added to the current time to get the oldest
# accepted publication date. It must be negative: -30 keeps the last 30 days.
date_from = -30

Variables

# Path of the CSV file the resulting dataframe is written to.
# NOTE(review): the name says "REMOTIVE" although this notebook queries
# RemoteOK - possibly a copy-paste leftover; confirm before renaming.
csv_output = "REMOTIVE_JOBS.csv"

Model

Get jobs from RemoteOk

# Endpoint queried by get_jobs; a "?tag=<category>" query string is appended.
REMOTEOK_API = "https://remoteok.com/api"

# Format used to parse a job's 'date' field once everything from the first
# '+' onwards has been cut off.
REMOTEOK_DATETIME = "%Y-%m-%dT%H:%M:%S"

# Format used for the PUBLICATION_DATE column of the output dataframe.
NAAS_DATETIME = "%Y-%m-%d %H:%M:%S"
def _job_to_row(job, oldest_allowed):
    """Convert one API job entry into an output row.

    Returns a dict keyed by the output column names, or ``None`` when the
    job was published before ``oldest_allowed`` (a POSIX timestamp).
    """
    # Everything from the first '+' onwards is dropped before parsing.
    # NOTE(review): the resulting naive datetime is interpreted in the local
    # timezone by .timestamp(); if the API reports UTC the cutoff comparison
    # is shifted by the local UTC offset - confirm whether that matters.
    date = job['date'].split('+')[0]
    published = datetime.strptime(date, REMOTEOK_DATETIME).timestamp()
    if published < oldest_allowed:
        return None
    return {
        'URL': job.get('url'),
        'TITLE': job.get('position'),
        'COMPANY': job.get('company'),
        # 'tags' can be absent or null; joining None would raise TypeError.
        'TAGS': ", ".join(job.get('tags') or []),
        'LOCATION': job.get('location'),
        'PUBLICATION_DATE': datetime.fromtimestamp(published).strftime(NAAS_DATETIME),
    }


def get_jobs(remoteok_url, categories):
    """Fetch recent job offers for the given tags and return them as a DataFrame.

    One GET request is sent per entry of ``categories`` to
    ``remoteok_url + "?tag=<category>"``. Only jobs whose publication time is
    not older than ``date_from`` days (module-level setting, negative number)
    are kept.

    Args:
        remoteok_url: Base URL of the jobs API.
        categories: Iterable of tag strings to query.

    Returns:
        A DataFrame with the columns URL, TITLE, COMPANY, TAGS, LOCATION and
        PUBLICATION_DATE, sorted by PUBLICATION_DATE (newest first). The frame
        is empty, but still has these columns, when nothing matches.

    Raises:
        requests.HTTPError: if a response carries an error status. The error
            is raised rather than returned, so callers never receive an
            exception object where a DataFrame is expected.
    """
    columns = ['URL', 'TITLE', 'COMPANY', 'TAGS', 'LOCATION', 'PUBLICATION_DATE']
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
    }
    # Rows are collected in a list and turned into a DataFrame once; growing
    # a DataFrame cell by cell with .loc is slow.
    # NOTE(review): a job matching several categories appears once per
    # category - confirm whether duplicates should be dropped.
    rows = []
    for tag in categories:
        url = remoteok_url + f"?tag={tag}"
        # A timeout keeps the notebook from hanging on a stalled connection.
        res = requests.get(url, headers=headers, timeout=30)
        res.raise_for_status()
        # Oldest accepted publication time in seconds since the epoch,
        # computed once per request instead of once per job.
        oldest_allowed = time.time() + date_from * 24 * 60 * 60
        # The first element of the response is skipped, as in the original
        # (presumably API metadata rather than a job - confirm).
        for job in res.json()[1:]:
            row = _job_to_row(job, oldest_allowed)
            if row is not None:
                rows.append(row)
    # Passing the columns explicitly keeps sort_values from raising KeyError
    # when no row was collected.
    df = pd.DataFrame(rows, columns=columns)
    return df.sort_values(by='PUBLICATION_DATE', ascending=False)
# Query every configured category, then preview the first five rows.
df_jobs = get_jobs(remoteok_url=REMOTEOK_API, categories=categories)
df_jobs.head(5)

Output

Save dataframe in csv

# Write the jobs to csv_output, leaving out the dataframe index column.
df_jobs.to_csv(path_or_buf=csv_output, index=False)