#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# BY: Reza (User:reza1615 on fa.wikipedia)
# BY: Z (User:ZxxZxxZ on fa.wikipedia)
# Distributed under the terms of the CC-BY-SA 3.0 license.
"""
You can run the bot with the following commandline parameters:
-file - Work on all pages given in a local text file.
Will read any [[wiki link]] and use these articles.
Argument can also be given as "-file:filename".
-cat - Work on all pages which are in a specific category.
Argument can also be given as "-cat:categoryname".
-page - Only edit a specific page.
Argument can also be given as "-page:pagetitle". You can give this
parameter multiple times to edit multiple pages.
-ref - Work on all pages that link to a certain page.
Argument can also be given as "-ref:referredpagetitle".
-filelinks - Work on all pages that link to a certain image.
Argument can also be given as "-filelinks:ImageName".
-links - Work on all pages that are linked to from a certain page.
Argument can also be given as "-links:linkingpagetitle".
-start - Work on all pages in the wiki, starting at a given page. Choose
"-start:!" to start at the beginning.
NOTE: You are advised to use -xml instead of this option; this is
meant for cases where there is no recent XML dump.
i.e. "-start:Category:!" or "-start:template:a"
-except:XYZ - Ignore pages which contain XYZ. If the -regex argument is given,
XYZ will be regarded as a regular expression.
-summary:XYZ - Set the summary message text for the edit to XYZ, bypassing the
predefined message texts with original and replacements inserted.
-template:XYZ - Work on all pages that transclude the template XYZ.
-namespace:n - Number of namespace to process. The parameter can be used
multiple times. It works in combination with all other
parameters, except for the -start parameter. If you e.g. want to
iterate over all user pages starting at User:M, use
-start:User:M.
-always - Don't prompt for each replacement.
-save - Save the result to a subpage on the wiki; you will be asked for
the subpage title (e.g. User:yourusername/findbox).
-nofa:XYZ - Choose which pages to keep: "onlyen" (the default) keeps only
infoboxes whose article has no equivalent on the target wiki,
"onlyfa" keeps only those that do. The shortcuts -onlyen and
-onlyfa do the same.
NOTE: Only use either -xml or -file or -cat or -template or -page, but don't mix them.
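Example run (the script filename below is only illustrative; use the name of
your local copy, and any category or page title you need):
python box_collector.py -cat:Academy_Awards -onlyen
python box_collector.py -page:"Academy Award for Best Picture" -onlyfa -save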
"""
try:
import MySQLdb
except ImportError:
pass
import wikipedia, sys
import pagegenerators
import re, os, codecs, catlib, string
import query
from collections import defaultdict
wikipedia.config.put_throttle = 0
wikipedia.put_throttle.setDelay()
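# Originally written for fa.wikipedia; the 'fa'-named variables and files
# below now point at the Arabic Wikipedia ('ar').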
faSite = wikipedia.getSite('ar')
enSite = wikipedia.getSite('en')
txtTmp=' '
adressfa=u' '
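# Edit summary used on the target wiki ("Bot: transferring the infobox").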
msg = u'بوت نقل قالب المعلومات'
savewiki= False
nofa='onlyen'
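# Template names treated as the start of an infobox; boxfind() and BotRun()
# match them case-insensitively on the first letter.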
boxes=[u'infobox',u'Geobox']
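# Return the title of the interwiki (langlink) equivalent of enlink on
# secondsite, using the MediaWiki API; returns False if there is none.
# If check is True the target page is fetched and redirects are resolved.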
def englishdictionry( enlink ,firstsite,secondsite,check):
try:
enlink=unicode(str(enlink),'UTF-8').replace(u'[[',u'').replace(u']]',u'').replace(u'en:',u'').replace(u'ar:',u'')
except:
enlink=enlink.replace(u'[[',u'').replace(u']]',u'').replace(u'en:',u'').replace(u'ar:',u'')
enlink=enlink.split(u'#')[0].strip()
if enlink==u'':
return False
enlink=enlink.replace(u' ',u'_')
site = wikipedia.getSite(firstsite)
sitesecond= wikipedia.getSite(secondsite)
params = {
'action': 'query',
'prop': 'langlinks',
'titles': enlink,
'redirects': 1,
'lllimit':500,
}
try:
categoryname = query.GetData(params,site, encodeTitle = True)
for item in categoryname[u'query'][u'pages']:
case=categoryname[u'query'][u'pages'][item][u'langlinks']
for item in case:
if item[u'lang']==secondsite:
intersec=item[u'*']
break
if check==True:
secondsitep = wikipedia.Page( sitesecond,intersec)
try:
text = secondsitep.get()
except wikipedia.NoPage:
wikipedia.output(u"%s doesn't exist, skip!" % secondsitep.title())
return False
except wikipedia.IsRedirectPage:
wikipedia.output(u"%s is a redirect; using its target." % secondsitep.title())
newpage = secondsitep.getRedirectTarget()
intersec=newpage.title()
result=intersec
return result
except:
return False
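# Resolve a possible redirect on firstsite via the API and return the final
# title; if the page is not a redirect, the cleaned-up title is returned.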
def redirectquery( enlink,firstsite):
try:
enlink=unicode(str(enlink),'UTF-8').replace(u'[[',u'').replace(u']]',u'').replace(u'en:',u'').replace(u'ar:',u'')
except:
enlink=enlink.replace(u'[[',u'').replace(u']]',u'').replace(u'en:',u'').replace(u'ar:',u'')
enlink=enlink.split(u'#')[0].strip()
if enlink==u'':
return False
enlink=enlink.replace(u' ',u'_')
site = wikipedia.getSite(firstsite)
params = {
'action': 'query',
'titles': enlink,
'redirects':1}
try:
redirectname = query.GetData(params,site, encodeTitle = True)
redirectname=redirectname[u'query'][u'redirects'][0]['to'].replace(u'_',u' ')
return redirectname
except:
enlink=enlink.replace(u'_',u' ')
return enlink
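# Walk the subcategories of every category in listacategory, appending new
# ones to the list; the walk is capped at roughly 200 categories.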
def categorydown(listacategory):
wikipedia.config.put_throttle = 0
wikipedia.put_throttle.setDelay()
count=1
for catname in listacategory:
count+=1
if count==200:
break
gencat = pagegenerators.SubCategoriesPageGenerator(catname, recurse=False)
try:
for subcat in gencat:
try:
wikipedia.output(str(subcat))
except:
wikipedia.output(subcat)
if subcat in listacategory:
continue
else:
listacategory.append(subcat)
except:
continue
return listacategory
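# Query the enwiki database replica for the members of an English category
# that already have an 'ar' langlink; returns their Arabic titles, or False.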
def pagefafinder(encatTitle):
cats=[]
try:
item=unicode(str(encatTitle),'Ascii').replace('[[en:','').replace(']]','').replace(' ','_').replace('Category:','')
except:
item=str(encatTitle).replace('[[en:','').replace(']]','').replace(' ','_').replace('Category:','')
#-----------------start sql---------------------------------------
queries ='SELECT /* SLOW_OK */ ll_title FROM page JOIN categorylinks JOIN langlinks WHERE cl_to = "'+item+'" AND cl_from=page_id AND page_namespace = 0 AND page_id =ll_from AND ll_lang = "ar" AND page_namespace = 0 GROUP BY ll_title ;'
site1 = wikipedia.getSite('en')
TS_DB_HOST = 'sql-s3'
MY_CNF = '~/.my.cnf'
cn = MySQLdb.connect("enwiki.labsdb", db = site1.dbName(), read_default_file = '~/.my.cnf')
cur = cn.cursor()
cur.execute(queries)
results = cur.fetchall()
cn.close()
#------------------end of sql--------------------------------------------
for raw in results:
cats.append(raw)
if cats!=[]:
return cats
else:
return False
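# Build the work list for a category: its subcategories are walked with
# categorydown() and their members collected, preferring the SQL helper and
# falling back to the API generator when the database is unavailable.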
def encatlist(encat):
encat=encat.replace(u'Category:Category:',u'Category:')
wikipedia.config.put_throttle = 0
wikipedia.put_throttle.setDelay()
count=0
listenpageTitle=[]
PageTitle=encat.replace(u'[[',u'').replace(u']]',u'').strip()
language='en'
PageTitles =[PageTitle]
for PageTitle in PageTitles:
cat = catlib.Category( wikipedia.getSite(language),PageTitle )
if str(cat).find('stubs')!=-1:
continue
listacategory=[cat]
listacategory=categorydown(listacategory)
for enpageTitle in listacategory:
try:
fapages=pagefafinder(enpageTitle)
for pages in fapages:
pages=unicode(pages[0],'UTF-8')
wikipedia.output(u'\03{lightgreen}Adding '+pages+u' to fapage lists\03{default}')
listenpageTitle.append(pages)
except:
try:
enpageTitle=unicode(str(enpageTitle),'UTF-8').split(u'|')[0].split(u']]')[0].replace(u'[[',u'').strip()
except:
enpageTitle=enpageTitle.split(u'|')[0].split(u']]')[0].replace(u'[[',u'').strip()
cat = catlib.Category( wikipedia.getSite(language),enpageTitle )
gent = pagegenerators.CategorizedPageGenerator( cat )
for pagework in gent:
count+=1
try:
link=str(pagework).split(u'|')[0].split(u']]')[0].replace(u'[[',u'').strip()
except:
pagework=unicode(str(pagework),'UTF-8')
link=pagework.split(u'|')[0].split(u']]')[0].replace(u'[[',u'').strip()
listenpageTitle.append(link)
if listenpageTitle==[]:
return False
return listenpageTitle
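# Fallback extractor: scan the wikitext line by line, counting '{{' and '}}'
# until they balance, and return the first template whose name starts with
# one of the names in boxes.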
def boxfind(text_en):
text_en=text_en.replace(u'{{ ',u'{{').replace(u'{{ ',u'{{').replace(u'{{template:',u'{{').replace(u'{{Template:',u'{{')
lines=text_en.split('\n')
start=False
box=u'\n'
diff=1
linebaz,linebasteh=0,0
for our_box in boxes:
our_box=our_box.strip()
up_our_box=our_box[0].upper()+our_box[1:]
lower_our_box=our_box[0].lower()+our_box[1:]
for line in lines:
if line==u'':
continue
if line.find(lower_our_box)!=-1 :# lower case
start=True
linebaz,linebasteh=0,0
box+=u'{{'+lower_our_box+line.split(u'{{'+lower_our_box)[1]+'\n'
linebaz += string.count( line,"{{" )
linebasteh += string.count( line,"}}" )
diff=linebaz-linebasteh
continue
if line.find(up_our_box)!=-1 :# upper case
start=True
linebaz,linebasteh=0,0
box+=u'{{'+up_our_box+line.split(u'{{'+up_our_box)[1]+'\n'
linebaz += string.count( line,"{{" )
linebasteh += string.count( line,"}}" )
diff=linebaz-linebasteh
continue
if start==True and diff!=0:
linebaz += string.count( line,"{{" )
linebasteh += string.count( line,"}}" )
diff=linebaz-linebasteh
box+=line+'\n'
if diff==0 and start==True:
break
return box
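# Extract the infobox from text_en; depending on the nofa mode, pages that do
# (or do not) already have an equivalent article on the target wiki are
# skipped. Returns the box wrapped in @@@/$$$ markers, or False.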
def BotRun(page,text_en,nofa,counting):
wikipedia.output(u'------Article Number \03{lightblue}'+unicode(str(counting),'UTF-8')+u'\03{default} ----'+unicode(str(page),'UTF-8')+u'------------')
faresult=englishdictionry( page ,'en','ar',False)
if faresult==False and nofa=='onlyfa':
wikipedia.output( u'\03{lightpurple}===>'+unicode(str(page),'UTF-8')+u' in fawiki has no article \03{default}')
return False
if faresult!=False and nofa=='onlyen':
wikipedia.output( u'\03{lightpurple}===>'+unicode(str(page),'UTF-8')+u' in fawiki has article \03{default}')
return False
lines=text_en.split('\n')
matn=' '
for line in lines:
linebaz=string.count(line,'{{')
linebaste=string.count(line,'}}')
diff=linebaz-linebaste
if diff==0:
line=line.replace('{{','$AAAA$').replace('}}','!BBBB!')
linebaz=0
linebaste=0
matn+=line+u'\n'
newtext=''
for our_box in boxes:
our_box=our_box.strip()
try:
newtext= re.search(ur'(\{\{\s*['+our_box[0].lower()+our_box[0].upper()+ur']'+our_box[1:]+ur'[_\s](?:\{\{.*?\}\}|[^\}])*\}\})',matn, re.S).group(1)# if the template box has another name, please change this regex
newtext=newtext.replace(u'$AAAA$',u'{{').replace(u'!BBBB!',u'}}')
break
except:
continue
if not newtext.strip():
newtext=boxfind(text_en)
if not newtext.strip():
wikipedia.output( u'===>'+unicode(str(page),'UTF-8')+u' Without Templatebox' )
return False
wikipedia.output( u'\03{lightgreen}'+unicode(str(page),'UTF-8')+u" added to list \03{default}")
newtext=u'\n@@@\n$$$'+unicode(str(page),'UTF-8')+u'$$$\n'+newtext+u'\n@@@\n'
return newtext
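# Query the database replica for articles that transclude the given template
# and have an 'ar' langlink. This helper is not referenced anywhere else in
# the script at the moment.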
def templatefinder(linkingPageTitle,nofa):
articles=[]
try:
item=unicode(str(linkingPageTitle),'Ascii').replace('[[en:','').replace(']]','').replace(' ','_').replace('Template:','').replace('template:','').replace('قالب:','')
except:
item=str(linkingPageTitle).replace('[[en:','').replace(']]','').replace(' ','_').replace('Template:','').replace('template:','').replace('قالب:','')
#-----------------start sql---------------------------------------
if nofa=='onlyfa':
queries ='SELECT /* SLOW_OK */ page_title FROM page JOIN langlinks WHERE page_namespace = 0 AND page_is_redirect = 0 AND page_id IN (SELECT tl_from FROM templatelinks WHERE tl_title = "'+item+'" AND tl_namespace = 10) AND ll_lang = "ar" GROUP BY page_title LIMIT 10;'
else:
# No query has been written for the 'onlyen' case yet, so bail out
# instead of executing an undefined statement.
return False
wikipedia.output(queries)
site1 = wikipedia.getSite('en')
TS_DB_HOST = 'sql-s3'
MY_CNF = '~/.my.cnf'
cn = MySQLdb.connect("enwiki.labsdb", db = site1.dbName(), read_default_file = '~/.my.cnf')
cur = cn.cursor()
cur.execute(queries)
results = cur.fetchall()
cn.close()
#------------------end of sql--------------------------------------------
for raw in results:
articles.append(raw)
if articles!=[]:
return articles
else:
return False
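# Main worker: iterate over the generator, extract every infobox with
# BotRun() and append the results to tempresult.txt; the English categories
# of each page are appended to encats.txt.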
def run(generator,savewiki,adressfa,nofa):
wikipedia.config.put_throttle = 0
wikipedia.put_throttle.setDelay()
site = wikipedia.getSite( 'en' )
with codecs.open( u'tempresult.txt',mode = 'w',encoding = 'utf8' ) as fars:
fars.write( u'\n' )
with codecs.open( u'encats.txt',mode = 'w',encoding = 'utf8' ) as fars2:
fars2.write( u'\n' )
new_text='\n'
counting=0
for pageen in generator:
try:
enlink=unicode(str(pageen),'UTF-8').replace(u'[[',u'').replace(u']]',u'')
except:
enlink=pageen.replace(u'[[',u'').replace(u']]',u'')
if enlink.find(u'talk:')!=-1 or enlink.find(u'Talk:')!=-1 or enlink.find(u'User:')!=-1 or enlink.find(u'Template:')!=-1 or enlink.find(u'Wikipedia:')!=-1 or enlink.find(u'Category:')!=-1:
continue
page = wikipedia.Page( site,enlink )
try:
text_fa = page.get()
catsen=page.categories()
except wikipedia.NoPage:
wikipedia.output( u'Page %s not found' % page.title() )
continue
except wikipedia.IsRedirectPage:
pageRedirect = page.getRedirectTarget()
text_fa = pageRedirect.get()
catsen=pageRedirect.categories()
wikipedia.output( u'Page was a redirect; working on %s instead.' % pageRedirect )
except:
continue
counting+=1
new_text=BotRun(page,text_fa,nofa,counting)
if new_text==False or new_text=='\n':
continue
with codecs.open( u'tempresult.txt',mode = 'a',encoding = 'utf8' ) as fars:
fars.write( new_text )
catsen=unicode(str(catsen),'UTF-8').replace(u'[Category{',u'').replace(u'}, Category{',u'\n').replace(u']]}]',u']]').replace(u'[[en:',u'[[:en:')+u'\n'
with codecs.open( u'encats.txt',mode = 'a',encoding = 'utf8' ) as encatfile:
encatfile.write( catsen )
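# Replace the English wiki links inside tempresult.txt with their Arabic
# equivalents (looked up through the langlinks API) and write the result to
# NeededBoxes.txt; links already containing Perso-Arabic characters are kept.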
def linktranslation():
farsichar=u'ابضصثقفغعهخحجچشسیلتنمکگظطزرذدپو۱۲۳۴۵۶۷۸۹۰'
wikipedia.output(u'\03{lightgreen} Translating Links .... \03{default}')
text2 = codecs.open( u'tempresult.txt','r' ,'utf8' )
text = text2.read()
linken = re.findall(ur'\[\[.*?\]\]',text, re.S)
for item in linken:
passport=True
if not item in text:
continue
if item.find(u'File:')!=-1 or item.find(u'file:')!=-1 or item.find(u'Image:')!=-1 or item.find(u'image:')!=-1 or item.find(u'Category:')!=-1 or item.find(u'category:')!=-1:
continue
for i in farsichar:
if i in item:
passport=False
break
if not passport:
continue
itemmain=item
item=item.replace(u'en:',u'')
if item.find('user:')!=-1 or item.find('User:')!=-1 or item.find('template:')!=-1 or item.find('Template:')!=-1 or item.find('category:')!=-1 or item.find('Category:')!=-1 or item.find('Wikipedia:')!=-1 or item.find('wikipedia:')!=-1 or item.find('Talk:')!=-1 or item.find('talk:')!=-1 or item.find('Help:')!=-1 or item.find('help:')!=-1:
continue
itemen=item.split(u'|')[0].replace(u'[[',u'').replace(u']]',u'').strip()
if text.find(itemmain)!=-1:
itemfa=englishdictionry(itemen ,'en','ar',False)
wikipedia.output(itemen)
else:
continue
if itemfa==False:
itemen=item.replace(u'[[',u'').replace(u']]',u'').strip()
itemen=itemen.replace(u'[[',u'').replace(u']]',u'')
text=text.replace(u'[['+itemen+u']]',u'@1@'+itemen+u'@2@')
continue
else:
text=text.replace(itemmain,u'@1@'+itemfa+u'@2@')
linken = re.findall(ur'\[\[.*?\]\]',text, re.S)
wikipedia.output(str(len(linken))+u' links remain')
text=text.replace(u'@1@',u'[[').replace(u'@2@',u']]')
text=text.replace(u'$$$[[',u"'''").replace(u']]$$$',u"'''")
text=text.replace(u'@@@\n\n@@@',u'@@@\n@@@').strip()
with codecs.open( u'NeededBoxes.txt',mode = 'w',encoding = 'utf8' ) as f:
f.write(text)
wikipedia.output(u'\03{lightblue} Translated boxes have been written to NeededBoxes.txt \03{default}')
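# Parse the command line, build the page generator and hand it to run().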
def main(savewiki,adressfa,nofa):
summary_commandline,template,gen = None,None,None
exceptions,PageTitles,namespaces = [],[],[]
linkingPageTitle=''
encat=''
autoText,autoTitle = False,False
genFactory = pagegenerators.GeneratorFactory()
wikipedia.setAction( msg )
for arg in wikipedia.handleArgs():
if arg == '-autotitle':
autoTitle = True
elif arg == '-autotext':
autoText = True
elif arg.startswith( '-page:' ):
if len(arg) == 6:
PageTitles.append(wikipedia.input( u'Which page do you want to change?' ))
else:
PageTitles.append(arg[6:])
elif arg.startswith( '-cat:' ):
if len(arg) == 5:
encat=wikipedia.input( u'Which Category do you want to change?' )
else:
encat='Category:'+arg[5:]
elif arg.startswith('-except:'):
exceptions.append(arg[8:])
elif arg.startswith( '-namespace:' ):
namespaces.append( int( arg[11:] ) )
elif arg.startswith( '-ns:' ):
namespaces.append( int( arg[4:] ) )
elif arg.startswith( '-ref:' ):
linkingPageTitle= arg[5:]
elif arg.startswith('-file:'):
textfilename = arg[6:]
if not textfilename:
textfilename = wikipedia.input(
u'Please enter the local file name:')
gen = pagegenerators.TextfilePageGenerator(textfilename,'en')
elif arg.startswith( '-nofa:' ):
nofa=arg[6:].strip()
nofa=nofa.replace(u'faonly',u'onlyfa').replace(u'enonly',u'onlyen')
if nofa!='onlyfa':
nofa='onlyen'
elif arg.startswith( '-onlyfa' ):
nofa='onlyfa'
elif arg.startswith( '-onlyen' ):
nofa='onlyen'
elif arg.startswith( '-save' ):
savewiki= True
adressfa= wikipedia.input(u'Write your wiki subpage like (User:yourusername/findbox) :> ').decode('utf-8')
else:
generator = genFactory.handleArg(arg)
if generator:
gen = generator
#--------------------------------------------------------------------the template name----------------
#linkingPageTitle=u'Template:Infobox school'
#--------------------------------------------------------------------------------------------------------
if encat!='':
encatfalist=encatlist(encat)
if encatfalist!=False:
run(encatfalist,savewiki,adressfa,nofa)
if PageTitles:
pages = [wikipedia.Page(enSite,PageTitle) for PageTitle in PageTitles]
gen = iter( pages )
if linkingPageTitle:
linkingPage = wikipedia.Page(enSite, linkingPageTitle)
pages=pagegenerators.ReferringPageGenerator(linkingPage)
gen = iter(pages)
wikipedia.output(linkingPageTitle)
if namespaces != []:
gen = pagegenerators.NamespaceFilterPageGenerator( gen,namespaces )
preloadingGen = pagegenerators.PreloadingGenerator( gen,pageNumber = 60 )
run(preloadingGen,savewiki,adressfa,nofa)
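# Count how often each English category without an Arabic equivalent occurs
# in encats.txt and write a sortable wikitable to NeededCategories.txt.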
def catenlist():
wikipedia.output(u'\03{lightgreen} Listing Needed categories.... \03{default}')
text2 = codecs.open( u'encats.txt','r' ,'utf8' )
text = text2.read()
linken = re.findall(ur'\[\[.*?\]\]',text, re.S)
for item in linken:
itemmain=item
item=item.replace(u':en:',u'').replace(u'en:',u'')
itemen=item.split(u'|')[0].replace(u'[[',u'').replace(u']]',u'').strip()
if text.find(itemmain)!=-1:
itemfa=englishdictionry(itemen ,'en','ar',False)
wikipedia.output(itemen)
else:
continue
if itemfa==False:
itemen=item.replace(u'[[',u'').replace(u']]',u'').strip()
itemen=itemen.replace(u'[[',u'').replace(u']]',u'')
text=text.replace(u'[['+itemen+u']]',u'@1@'+itemen+u'@2@')
continue
else:
text=text.replace(itemmain,u'')
text=text.replace(u'[[]]',u'').replace(u'@1@@2@',u'')
linken = re.findall(ur'\[\[.*?\]\]',text, re.S)
wikipedia.output(str(len(linken))+u' links remain')
text=text.replace(u'@1@',u'[[').replace(u'@2@',u']]')
textup=u"التصانيف المعادلة التي يجب إنشاؤها.\n"
textup+=u'{| class="wikitable sortable"\n!الرقم!!التصنيف!!مرات الأستخدام\n'
linken = re.findall(ur'\[\[.*?\]\]',text, re.S)
counters=0
dict={}
for item in linken:
if text.lower().count(item.lower())==0:
continue
dict[item]=text.count(item)
counters=0
for key, value in sorted(dict.iteritems(), key=lambda (k,v): (v,k), reverse=True):
counters+=1
textup+=u'|-\n|'+str(counters)+u'||'+key+u'||'+str(value)+u'\n'
text=text.replace(item,u'').replace(u'[[]]',u'').replace(u'[[',u'[[:en:')
textup+=u'|-\n|}'
savefile='NeededCategories.txt'
with codecs.open(savefile ,mode = 'w',encoding = 'utf8' ) as f:
f.write(textup)
wikipedia.output(u"\03{lightblue} Needed Category's File is made and it's name is "+unicode(savefile,'UTF-8')+u" \03{default}")
def mainarticles():
wikipedia.output(u'\03{lightgreen} Listing Needed Articles....\03{default}')
farsichar=u'ابضصثقفغعهخحجچشسیلتنمکگظطزرذدپو۱۲۳۴۵۶۷۸۹۰'
filesample = 'NeededBoxes.txt'
text2 = codecs.open( filesample,'r' ,'utf8' )
text = text2.read()
linken =re.findall(ur'\[\[.*?\]\]',text, re.S)
enlinks=[]
for item in linken:
passport=True
if not item in text:
continue
if item.find(u'File:')!=-1 or item.find(u'file:')!=-1 or item.find(u'Image:')!=-1 or item.find(u'image:')!=-1 or item.find(u'Category:')!=-1 or item.find(u'category:')!=-1:
continue
for i in farsichar:
if i in item:
passport=False
break
if passport:
itemnew=item.split(u'|')[0].replace(u'[[',u'').replace(u']]',u'').replace(u':en:',u'').replace(u'en:',u'').strip()
redirect=redirectquery(itemnew,'en')
if redirect:
enlink=u'AAA'+redirect+u'$$$'
else:
continue
text=text.replace(item,enlink)
linken =re.findall(ur'\[\[.*?\]\]',text, re.S)
wikipedia.output(item)
wikipedia.output(str(len(linken))+u' links remain')
text=text.replace(u'AAA',u'[[').replace(u'$$$',u']]')
linken =re.findall(ur'\[\[.*?\]\]',text, re.S)
for item in linken:
passport=True
itemmain=item
item=item.replace(u':en:',u'').replace(u'en:',u'')
item=item.split(u'|')[0].replace(u'[[',u'').replace(u']]',u'').strip()
for i in farsichar:
if i in item:
passport=False
break
if item.find(u'File:')!=-1 or item.find(u'file:')!=-1 or item.find(u'Image:')!=-1 or item.find(u'image:')!=-1 or item.find(u'Category:')!=-1 or item.find(u'category:')!=-1:
continue
if text.find(u'[['+item)!=-1 and passport:
if not item in enlinks:
enlinks.append(item)
textup=u"المقالات يجب إنشاؤها كي لاتبقى وصلة حمراء في المقالات.\n"
textup+=u'{|class="wikitable sortable"\n!الرقم!!المقالة!!مرات الاستخدام\n'
dict={}
for item in enlinks:
if text.count(u'[['+item)==0:
continue
dict[item]=text.count(u'[['+item)
counters=0
for key, value in sorted(dict.iteritems(), key=lambda (k,v): (v,k), reverse=True):
counters+=1
textup+=u'|-\n|'+str(counters)+u'||[[:en:'+key+u']]||'+str(value)+u'\n'
textup+=u'|-\n|}'
savefile='NeededArticles.txt'
with codecs.open( savefile,mode = 'w',encoding = 'utf8' ) as f:
f.write( textup )
wikipedia.output(u"\03{lightblue} Needed Article's File is made and it's name is "+unicode(savefile,'UTF-8')+u" \03{default}")
def exceleporter():
wikipedia.output(u'\03{lightgreen} Making excel File.... \03{default}')
count = 0
filesample = 'NeededBoxes.txt'
text2 = codecs.open( filesample,'r' ,'utf8' )
text = text2.read()
#-------------------------------------------------------
#Fields section
# The infobox fields that should be extracted must be listed here
#items=(u'number',u'organ',u'date',u'year',u'meeting',u'code',u'document',u'for',u'abstention',u'against',u'subject',u'result',u'image',u'caption')
#items=(u'name',u'image',u'caption',u'fullname',u'birth_date',u'birth_place',u'death_date',u'death_place',u'height',u'position',u'currentclub',u'youthyears1',u'youthclubs1',u'collegeyears1',u'collegeclubs1',u'collegecaps1',u'collegegoals1',u'years1',u'clubs1',u'caps1',u'goals1',u'totalcaps',u'totalgoals',u'nationalyears1',u'nationalteam1',u'nationalcaps1',u'nationalgoals1',u'medaltemplates',u'manageryears1',u'managerclubs1',u'club-update',u'nationalteam-update')
#items=(u'name',u'official_name',u'image',u'image_size',u'image_caption',u'image_alt',u'location_map',u'location_map_width',u'location_map_text',u'lat_d',u'lat_m',u'lat_s',u'lat_NS',u'long_d',u'long_m',u'long_s',u'long_EW',u'coordinates_type',u'coordinates_display',u'coordinates_ref',u'country',u'location',u'status',u'construction_began',u'commissioned',u'licence_expires',u'decommissioned',u'cost',u'owner',u'operator',u'developer',u'constructor',u'reactors_operate_mw',u'reactors_const_mw',u'reactors_planned_mw',u'reactors_decom_mw',u'reactors_cancel_mw',u'reactor_type',u'reactor_supplier',u'turbine_manu_npp',u'installed_capacity',u'max_planned_cap',u'capacity_factor',u'average_annual_gen',u'net_generation',u'website',u'as_of',u'extra')
#items=(u'name ',u'image_skyline',u'image_map',u'latd',u'longd',u'area_total_km2',u'population_total',u'population_as_of',u'population_density_km2',u'timezone1',u'postal_code',u'website',u'image') #City
items=(u'number',u'award',u'image',u'caption',u'date',u'site',u'host',u'producer',u'director',u'best_picture',u'most_wins',u'most_nominations',u'network',u'duration',u'ratings',u'last',u'next')
#-------------------------------------------------------
for a in range(0,30):
text=text.replace(u' =',u'=').replace(u'| ',u'|').replace(u'= ',u'=')
text=text.replace(u'@@@\n\n@@@',u'@@@').replace(u'\t',u'')
dict={}
count=-1
text=re.sub(ur'(?s)\<ref(.*?)\/ref\>',ur"", text)
text=re.sub(ur'(?s)\<ref(.*?)\/>',ur"", text)
count=-1
for item in items:
count+=1
dict[count]=[item]
dict[count+1]=[u'names']
for pag in text.split(u'@@@' ):
pag=pag.replace(u'\r',u'')
if pag.strip()==u'':
continue
try:
onvan=pag.split("'''")[1].strip()
except:
continue
count=-1
for item in items:
count+=1
try:
im = re.search(ur'\|'+item+u'\=.*?\\n\|', pag)
itema=im.group(0).split(item+u'=')[1].replace(u'\n|',u'').replace(u'\n',u'').replace(u"''",u"").strip()
itema=itema.split(u'<')[0].strip()
if itema==u'N/A' or itema==u'*' or itema==u'':
itema=u' '
except:
itema=u' '
dict[count].append(itema)
onvan=pag.split("'''")[1].replace(u'en:',u'').strip()
dict[count+1].append(onvan)
number=len(items)+1
total=len(dict[0])
text=u'\n'
for b in range(0,total):
for a in range(0,number):
text+=dict[a][b]+u'\t'
text+=u'\n'
savefile='resultexcelc.txt'
with codecs.open(savefile ,mode = 'w',encoding = 'utf8' ) as f:
f.write( text.strip() )
wikipedia.output(u"\03{lightblue} Excel File is made and it's name is "+unicode(savefile,'UTF-8')+u"\03{default}")
if __name__ == "__main__":
try:
main(savewiki,adressfa,nofa)
except:
pass
linktranslation()
exceleporter()
catenlist()
mainarticles()