Links

Get jobs from categories

Tags: #remoteok #jobs #csv #snippet #opendata #dataframe
Author: Sanjeet Attili
Last update: 2023-04-12 (Created: 2022-03-03)
Description: RemoteOK is a job search platform that allows users to find jobs from a variety of categories. This notebook fetches the recent jobs for a list of categories and saves them to a CSV file.

Input

Import libraries

import pandas as pd
import requests
from datetime import datetime
import time

Setup Remoteok

# Tags to search for; get_jobs() sends one API request per entry.
categories = [
    "machine learning",
    "data science",
    "nlp",
    "deep learning",
    "computer vision",
    "data",
    "natural language processing",
    "data engineer",
]
# Look-back window in days, relative to the current time. The value is added
# to "now", so it must be negative: -30 keeps jobs published in the last 30 days.
date_from = -30

Variables

# Path of the CSV file the resulting job table is written to.
# NOTE(review): the file name says "REMOTIVE" although the data is fetched
# from RemoteOK -- confirm whether this name is intentional.
csv_output = "REMOTIVE_JOBS.csv"

Model

Get jobs from RemoteOk

# Base URL of the API endpoint; a "?tag=<category>" query is appended per request.
REMOTEOK_API = "https://remoteok.com/api"
# strptime pattern for the date-time part of a job's "date" field.
REMOTEOK_DATETIME = "%Y-%m-%dT%H:%M:%S"
# strftime pattern used for the PUBLICATION_DATE column of the output table.
NAAS_DATETIME = "%Y-%m-%d %H:%M:%S"
​
​
def _parse_job_date(raw_date):
    """Parse a job's "date" string into a datetime.

    The UTC offset is honoured when the running Python can parse it with
    ``%z``; otherwise the text after "+" is dropped and the value is treated
    as a naive (local-time) datetime, which is what the code did originally.
    """
    try:
        return datetime.strptime(raw_date, REMOTEOK_DATETIME + "%z")
    except ValueError:
        return datetime.strptime(raw_date.split("+")[0], REMOTEOK_DATETIME)


def get_jobs(remoteok_url, categories):
    """Fetch recent job postings for every category and return them as a table.

    One GET request is sent per category to ``<remoteok_url>?tag=<category>``.
    The first element of each response is skipped and never treated as a job
    (NOTE(review): presumably an API notice/metadata entry -- confirm).
    Jobs published before the cutoff derived from the module-level
    ``date_from`` (a negative number of days) are discarded.

    Args:
        remoteok_url: Base URL of the jobs API.
        categories: Iterable of tag strings to query.

    Returns:
        A pandas DataFrame with the columns URL, TITLE, COMPANY, TAGS,
        LOCATION and PUBLICATION_DATE, sorted by PUBLICATION_DATE with the
        newest first. The frame is empty, but still has these columns, when
        no job matches.

    Raises:
        requests.HTTPError: If any request returns an error status. The
            error used to be *returned* in place of the DataFrame, which made
            callers fail later with an unrelated AttributeError.
    """
    columns = ["URL", "TITLE", "COMPANY", "TAGS", "LOCATION", "PUBLICATION_DATE"]
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
    }
    # date_from is negative, so the cutoff is a Unix timestamp in the past.
    # It is computed once so every job is compared against the same instant.
    cutoff = time.time() + date_from * 24 * 60 * 60

    rows = []
    for tag in categories:
        url = remoteok_url + f"?tag={tag}"
        # A timeout keeps one stalled connection from hanging the whole run.
        res = requests.get(url, headers=headers, timeout=30)
        res.raise_for_status()

        # Element 0 is not a job; a response holding only that element
        # simply yields no rows for this tag.
        for job in res.json()[1:]:
            published = _parse_job_date(job["date"])
            if published.timestamp() < cutoff:
                continue
            rows.append(
                {
                    "URL": job.get("url"),
                    "TITLE": job.get("position"),
                    "COMPANY": job.get("company"),
                    # A missing or null tag list becomes an empty string.
                    "TAGS": ", ".join(job.get("tags") or []),
                    "LOCATION": job.get("location"),
                    # Wall-clock time exactly as published by the API.
                    "PUBLICATION_DATE": published.strftime(NAAS_DATETIME),
                }
            )

    # Passing the columns explicitly keeps sort_values() from raising
    # KeyError when no job passed the filter.
    df = pd.DataFrame(rows, columns=columns)
    return df.sort_values(by="PUBLICATION_DATE", ascending=False)
​
​
# Query the API for every configured category and collect the recent jobs.
df_jobs = get_jobs(REMOTEOK_API, categories)
# Preview the first five rows (displayed as the notebook cell's output).
df_jobs.head(5)

Output

Save dataframe in csv

# Write the job table to csv_output; index=False omits the DataFrame index column.
df_jobs.to_csv(csv_output, index=False)