-
간만에 동네 친구가 뭔갈 부탁

오, 재밌겠다 싶어 퇴근하고 바로 작업 시작
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
def _iter_jobkorea_rows(driver):
    """Yield one entry per blog table row that links to a JobKorea company page.

    Expects ``driver`` to already have the blog post loaded.
    """
    for table in driver.find_elements(By.CSS_SELECTOR, "table.se-table-content"):
        for row in table.find_elements(By.CSS_SELECTOR, "tr.se-tr"):
            cells = row.find_elements(By.TAG_NAME, "td")
            if len(cells) < 2:
                continue
            name_cell = cells[1]
            # find_elements returns an empty list when the cell has no link,
            # so no exception has to be swallowed to detect that case.
            anchors = name_cell.find_elements(By.TAG_NAME, "a")
            if not anchors:
                continue
            anchor = anchors[0]
            # get_attribute returns None for an anchor without href; normalise
            # to "" so the substring test below cannot raise TypeError.
            href = anchor.get_attribute("href") or ""
            if "jobkorea.co.kr/company" not in href:
                continue
            yield {
                "rank": cells[0].text.strip(),
                # Fall back to the whole cell text when the link text is empty.
                "blog_company_name": anchor.text.strip() or name_cell.text.strip(),
                "jobkorea_link": href,
            }


def parse_naver_blog(url):
    """Scrape a Naver blog post for table rows linking to JobKorea company pages.

    Loads ``url`` in headless Chrome and scans every ``table.se-table-content``
    table. A row is kept only when it has at least two cells and the second
    cell contains a link whose href includes ``jobkorea.co.kr/company``.

    Args:
        url: Address of the blog post to load.

    Returns:
        A list of dicts with the keys ``"rank"`` (text of the first cell),
        ``"blog_company_name"`` (link text, or the cell text when the link
        text is empty) and ``"jobkorea_link"`` (the link href). If parsing
        fails part-way, the error is printed and the rows collected so far
        are returned.

    Raises:
        Whatever ``driver.get`` raises when the page cannot be loaded; the
        browser is shut down before the exception propagates.
    """
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    data_list = []
    try:
        # Navigation sits inside try/finally so a failed page load can no
        # longer leave the Chrome process running.
        driver.get(url)
        time.sleep(3)  # fixed pause; presumably lets the page finish rendering
        try:
            # Append one entry at a time so rows found before an error are kept.
            for entry in _iter_jobkorea_rows(driver):
                data_list.append(entry)
        except Exception as e:
            print("네이버 블로그 파싱 에러:", e)
    finally:
        driver.quit()
    return data_list
def _read_company_fields(driver, data):
    """Fill ``data`` in place from the company page currently loaded in ``driver``.

    Fields are written as soon as they are read, so values found before an
    error stay in ``data``.
    """
    # The company name is taken as the page-title text before the first ":".
    name = driver.title.split(":")[0].strip()
    if name:
        # An empty title keeps the placeholder instead of overwriting it with "".
        data["jobkorea_company_name"] = name

    table = driver.find_element(By.CSS_SELECTOR, ".table-basic-infomation-primary")
    for row in table.find_elements(By.CSS_SELECTOR, "tr.field"):
        labels = row.find_elements(By.CSS_SELECTOR, "th.field-label")
        values = row.find_elements(By.CSS_SELECTOR, "td.field-value")
        # zip stops at the shorter list, so a row with more labels than values
        # no longer raises IndexError and aborts the remaining rows.
        for label_elem, value_elem in zip(labels, values):
            label = label_elem.text.strip()
            if label == "홈페이지":
                # Prefer the link target; fall back to the visible text when
                # the cell has no link or the link has no href.
                links = value_elem.find_elements(By.TAG_NAME, "a")
                href = links[0].get_attribute("href") if links else None
                data["homepage"] = href or value_elem.text.strip()
            elif label.startswith("주소"):
                data["address"] = value_elem.text.strip()


def parse_jobkorea_company(url):
    """Scrape name, homepage and address from a JobKorea company page.

    Loads ``url`` in headless Chrome, takes the company name from the page
    title and reads the ``홈페이지`` and ``주소`` rows of the
    ``.table-basic-infomation-primary`` table.

    Args:
        url: Address of the JobKorea company page.

    Returns:
        A dict with the keys ``"jobkorea_company_name"``, ``"homepage"`` and
        ``"address"``. Fields that could not be read keep their placeholder
        values. If parsing fails part-way, the error is printed and the
        fields read so far are returned.

    Raises:
        Whatever ``driver.get`` raises when the page cannot be loaded; the
        browser is shut down before the exception propagates.
    """
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    data = {
        "jobkorea_company_name": "회사이름읎음",
        "homepage": "폐업?",
        "address": "정보 읎음",
    }
    try:
        # Navigation sits inside try/finally so a failed page load can no
        # longer leave the Chrome process running.
        driver.get(url)
        time.sleep(3)  # fixed pause; presumably lets the page finish rendering
        try:
            _read_company_fields(driver, data)
        except Exception as e:
            print("잡코 파싱 오류:", e)
    finally:
        driver.quit()
    return data
def collect_and_save_data(blog_urls, output_file="blog_jobkorea_result.xlsx"):
    """Scrape each blog post and its linked JobKorea pages, then write an Excel file.

    For every URL in ``blog_urls`` the post is parsed with
    ``parse_naver_blog``; each company link found is looked up with
    ``parse_jobkorea_company`` and the combined row is collected.

    Args:
        blog_urls: Iterable of blog post URLs. A single URL string is also
            accepted and treated as a one-element list.
        output_file: Path of the Excel file to write.

    Returns:
        None. When at least one row was collected it is written to
        ``output_file``; otherwise a message is printed and no file is written.

    Raises:
        Any exception raised while scraping or writing. Rows collected before
        the failure are still written to ``output_file`` first.
    """
    if isinstance(blog_urls, str):
        # A bare string would otherwise be iterated character by character.
        blog_urls = [blog_urls]

    all_rows = []
    # Each company lookup launches a browser, so repeated links reuse the
    # first result instead of scraping the same page again.
    info_by_link = {}
    try:
        for blog_url in blog_urls:
            print(f"[블로그파밍] {blog_url}")
            for item in parse_naver_blog(blog_url):
                rank = item["rank"]
                b_name = item["blog_company_name"]
                link = item["jobkorea_link"]
                print(f" - 링크 파싱: {rank}, {b_name}, {link}")
                if link not in info_by_link:
                    info_by_link[link] = parse_jobkorea_company(link)
                info = info_by_link[link]
                all_rows.append({
                    "blog_url": blog_url,
                    "rank": rank,
                    "blog_company_name": b_name,
                    "jobkorea_link": link,
                    "jobkorea_company_name": info["jobkorea_company_name"],
                    "homepage": info["homepage"],
                    "address": info["address"],
                })
    finally:
        # Saving in ``finally`` keeps the rows gathered so far when a later
        # page fails or the run is interrupted.
        if all_rows:
            pd.DataFrame(all_rows).to_excel(output_file, index=False)
            print(f"[완료] 엑셀 저장: {output_file}, 총 {len(all_rows)}건")
        else:
            print("데이터 ㄴㄴㄴ.")
if __name__ == "__main__":
    # Blog posts to scrape when the file is run as a script.
    target_posts = [
        "https://m.blog.naver.com/PostView.naver?blogId=esuccess&logNo=223193200932",
        # other blog URLs...
    ]
    collect_and_save_data(target_posts, output_file="1.xlsx")
잘됨
아, 물론 손 좀 봐야 하는데 귀찮
챗지피티 만세임
끗