
Collecting Web Data with Python

The urllib package

  • urlencode : generate a URL query (argument) string from a dictionary of parameters
  • urlopen : connect to a web server and read the response
  • urlretrieve : connect to a web server and save the retrieved document to a file (Python 3 equivalents are sketched below)
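These cells use the Python 2 urllib API, where all three functions live at the top level of the urllib module. In Python 3 the same functionality is split between urllib.parse and urllib.request; a minimal sketch of the equivalent calls (the URL here is a hypothetical placeholder, not the endpoint used below):

# Python 3 locations of the functions used in the cells below
from urllib.parse import urlencode
from urllib.request import urlopen, urlretrieve

params = urlencode({"s": "^KS11", "a": 4, "b": 20})    # build the query string
url = "http://example.com/table.csv?%s" % params       # hypothetical endpoint
data = urlopen(url).read()                              # connect and read the response
urlretrieve(url, "table.csv")                           # connect and save to a file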
In:
import urllib
In:
# encode the query parameters into a URL query string (Python 2 urllib)
params = urllib.urlencode({"a": 4, "b": 20, "c": 2016, "d": 6, "e": 30, "f": 2016, "s": "^KS11"})
params
Out:
'a=4&c=2016&b=20&e=30&d=6&f=2016&s=%5EKS11'
In:
# build the full request URL and read the CSV document from the web server
url = 'http://ichart.finance.yahoo.com/table.csv?g=d&ignore=.csv&%s' % params
data = urllib.urlopen(url).read()
print(data)
Date,Open,High,Low,Close,Volume,Adj Close
2016-07-06,1980.109985,1982.800049,1944.329956,1953.119995,457400,1953.119995
2016-07-05,1995.689941,1996.77002,1985.939941,1989.849976,405500,1989.849976
2016-07-04,1989.439941,1997.369995,1985.109985,1995.300049,468600,1995.300049
2016-07-01,1977.359985,1993.97998,1975.890015,1987.319946,448100,1987.319946
2016-06-30,1971.540039,1972.810059,1959.280029,1970.349976,349900,1970.349976
2016-06-29,1946.310059,1966.209961,1941.900024,1956.359985,451200,1956.359985
2016-06-28,1907.709961,1937.98999,1907.48999,1936.219971,373500,1936.219971
2016-06-27,1901.849976,1926.869995,1900.829956,1926.849976,394300,1926.849976
2016-06-24,2001.550049,2001.550049,1892.75,1925.23999,750500,1925.23999
2016-06-23,1989.420044,1990.709961,1980.140015,1986.709961,455400,1986.709961
2016-06-22,1981.780029,1994.76001,1979.369995,1992.579956,407400,1992.579956
2016-06-21,1977.800049,1983.869995,1971.98999,1982.699951,557400,1982.699951
2016-06-20,1974.430054,1984.069946,1969.560059,1981.119995,391300,1981.119995
2016-06-17,1966.22998,1970.540039,1949.599976,1953.400024,485300,1953.400024
2016-06-16,1971.530029,1974.01001,1944.800049,1951.98999,438800,1951.98999
2016-06-15,1972.00,1975.869995,1960.969971,1968.829956,376800,1968.829956
2016-06-14,1976.800049,1982.319946,1965.339966,1972.030029,439900,1972.030029
2016-06-13,2001.030029,2001.790039,1976.689941,1979.060059,443000,1979.060059
2016-06-10,2022.77002,2022.77002,2014.150024,2017.630005,404300,2017.630005
2016-06-09,2028.02002,2035.27002,2012.140015,2024.170044,599700,2024.170044
2016-06-08,2013.719971,2027.089966,2008.589966,2027.079956,497900,2027.079956
2016-06-07,1993.079956,2011.670044,1992.77002,2011.630005,481300,2011.630005
2016-06-03,1991.579956,1991.579956,1979.660034,1985.839966,540900,1985.839966
2016-06-02,1983.859985,1988.00,1978.819946,1985.109985,474200,1985.109985
2016-06-01,1976.869995,1986.76001,1975.819946,1982.719971,502300,1982.719971
2016-05-31,1962.469971,1984.180054,1958.160034,1983.400024,657000,1983.400024
2016-05-30,1969.719971,1970.540039,1955.48999,1967.130005,550500,1967.130005
2016-05-27,1962.329956,1971.27002,1959.75,1969.170044,523500,1969.170044
2016-05-26,1965.810059,1967.449951,1955.969971,1957.060059,558300,1957.060059
2016-05-25,1953.630005,1964.910034,1951.170044,1960.51001,656900,1960.51001
2016-05-24,1951.180054,1951.390015,1937.670044,1937.680054,517100,1937.680054
2016-05-23,1954.26001,1956.77002,1940.439941,1955.25,399500,1955.25
2016-05-20,1943.839966,1951.50,1940.359985,1947.670044,388100,1947.670044

In:
%cd /home/dockeruser
# fetch the document again and save it directly to a local CSV file
urllib.urlretrieve(url, "/home/dockeruser/kospi.csv")
%ls
/home/dockeruser
anaconda2/  data/  kospi.csv  notebooks/  scikit_learn_data/
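As a side note, urlretrieve returns the local path and the HTTP response headers, which can be used to confirm what was saved; a minimal sketch:

# urlretrieve returns a (filename, headers) pair
filename, headers = urllib.urlretrieve(url, "/home/dockeruser/kospi.csv")
print(filename)   # the path the document was written to
print(headers)    # the response headers sent by the server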
In:
# read the saved file back and inspect the first five lines
csv = open("/home/dockeruser/kospi.csv")
csv.readlines()[:5]
Out:
['Date,Open,High,Low,Close,Volume,Adj Close\n',
 '2016-07-06,1980.109985,1982.800049,1944.329956,1953.119995,457400,1953.119995\n',
 '2016-07-05,1995.689941,1996.77002,1985.939941,1989.849976,405500,1989.849976\n',
 '2016-07-04,1989.439941,1997.369995,1985.109985,1995.300049,468600,1995.300049\n',
 '2016-07-01,1977.359985,1993.97998,1975.890015,1987.319946,448100,1987.319946\n']
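Since the downloaded document is plain CSV, it can also be loaded straight into a pandas DataFrame rather than read line by line; a minimal sketch, assuming pandas is installed:

import pandas as pd

# parse the saved CSV, converting the Date column to datetimes and using it as the index
kospi = pd.read_csv("/home/dockeruser/kospi.csv", parse_dates=["Date"], index_col="Date")
kospi.head()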

The requests package

In:
import requests
url = "https://www.google.com/finance/historical?q=KRX%3AKOSPI200"
req = requests.get(url)
print(req.text[3000:4000])
Lrg-RaWvg" class=invfr tabindex="-1">

In:
from bs4 import BeautifulSoup
soup = BeautifulSoup(req.text, 'lxml')
In:
import dateutil.parser

# walk the rows of the historical price table and build a list of dicts
list_records = []
table = soup.find("table", class_="historical_price")
for r in table.find_all('tr'):
    cells = r.find_all('td')
    if not cells:
        continue  # the header row contains <th> cells only, so skip it
    for j, c in enumerate(cells):
        if j == 0:
            record = {"date": dateutil.parser.parse(c.text.strip())}
        elif j == 1:
            record.update({"open": float(c.text.strip())})
        elif j == 2:
            record.update({"high": float(c.text.strip())})
        elif j == 3:
            record.update({"low": float(c.text.strip())})
        elif j == 4:
            record.update({"close": float(c.text.strip())})
        elif j == 5:
            record.update({"volume": int(c.text.strip().replace(',',''))})
    list_records.append(record)
In:
list_records[:5]
Out:
[{'close': 240.91,
  'date': datetime.datetime(2016, 5, 25, 0, 0),
  'high': 241.45,
  'low': 239.28,
  'open': 239.68,
  'volume': 91189000},
 {'close': 241.86,
  'date': datetime.datetime(2016, 7, 6, 0, 0),
  'high': 245.74,
  'low': 240.73,
  'open': 245.37,
  'volume': 71672000},
 {'close': 246.91,
  'date': datetime.datetime(2016, 7, 5, 0, 0),
  'high': 247.84,
  'low': 246.51,
  'open': 247.67,
  'volume': 54810000},
 {'close': 247.62,
  'date': datetime.datetime(2016, 7, 4, 0, 0),
  'high': 247.94,
  'low': 246.16,
  'open': 246.66,
  'volume': 63633000},
 {'close': 246.52,
  'date': datetime.datetime(2016, 7, 1, 0, 0),
  'high': 247.57,
  'low': 244.78,
  'open': 244.96,
  'volume': 63045000}]
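BeautifulSoup also accepts CSS selectors through select(), which can shorten the lookup of the table rows; a rough equivalent of the loop above that just prints the cell texts:

# locate the same table with CSS selectors instead of find()/find_all()
for r in soup.select("table.historical_price tr"):
    cells = [c.text.strip() for c in r.select("td")]
    if cells:            # skip the header row, which has no <td> cells
        print(cells)     # [date, open, high, low, close, volume] as strings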
In:
import pandas as pd

# build a DataFrame from the scraped records with a fixed column order
df = pd.DataFrame(list_records,
                  columns=["date", "open", "high", "low", "close", "volume"])
df.tail()
Out:
date open high low close volume
26 2016-05-31 241.03 243.84 240.36 243.63 117966000
27 2016-05-30 241.94 242.07 240.41 241.73 67446000
28 2016-05-27 241.25 242.41 240.74 241.85 92225000
29 2016-05-26 241.56 242.05 240.45 240.58 130282000
30 2016-05-25 239.68 241.45 239.28 240.91 91189000
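The scraped rows arrive in page order, so a common follow-up step (not part of the original notebook) is to sort by date and use the dates as the index; a minimal sketch:

# sort chronologically and index by date for time-series work
df = df.sort_values("date").set_index("date")
df["close"].head()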
In:
import lxml.html

# parse the same HTML into an element tree that can be queried with XPath
tree = lxml.html.fromstring(req.text)
In:
import dateutil.parser
import numpy as np

# exact class match: "lm" cells are dates, "rgt rm" cells are volumes, plain "rgt" cells are the four prices
dates = [dateutil.parser.parse(x.text.strip()) for x in tree.xpath('//td[@class="lm"]')]
volumes = np.array([int(x.text.strip().replace(',','')) for x in tree.xpath('//td[@class="rgt rm"]')])
prices = np.reshape([float(x.text.strip()) for x in tree.xpath('//td[@class="rgt"]')], (-1, 4))
price_o = prices[:,0]
price_h = prices[:,1]
price_l = prices[:,2]
price_c = prices[:,3]
In:
df = pd.DataFrame({"date": dates, "open": price_o, "high": price_h, "low": price_l, "close": price_c, "volume": volumes},
                  columns=["date", "open", "high", "low", "close", "volume"])
df.tail()
Out:
date open high low close volume
25 2016-05-31 241.03 243.84 240.36 243.63 117966000
26 2016-05-30 241.94 242.07 240.41 241.73 67446000
27 2016-05-27 241.25 242.41 240.74 241.85 92225000
28 2016-05-26 241.56 242.05 240.45 240.58 130282000
29 2016-05-25 239.68 241.45 239.28 240.91 91189000
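Note that @class="rgt" in XPath matches the attribute string exactly, which is why the volume cells (class="rgt rm") are not caught by the price query. To match a class token the way CSS does, contains() or lxml's CSS selector support can be used; a minimal sketch (cssselect requires the separate cssselect package):

# match any cell whose class attribute contains "rgt" (this also catches "rgt rm")
right_aligned = tree.xpath('//td[contains(@class, "rgt")]')

# the same kind of lookup with CSS selectors
volume_cells = tree.cssselect("td.rgt.rm")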
